On Thu, Dec 22, 2016 at 12:17 PM, Amit Kapila <amit.kapil...@gmail.com> wrote:
> On Wed, Dec 21, 2016 at 9:26 PM, Robert Haas <robertmh...@gmail.com> wrote:
>> On Tue, Dec 20, 2016 at 2:25 PM, Mithun Cy <mithun...@enterprisedb.com> 
>> wrote:
>>> -- I think, if it is okay, I can document for each member of
>>> HashMetaPageData whether to read it from the cached metapage or directly
>>> from the current metapage. Below I have briefly commented on each member.
>>> If you suggest I go with that approach, I will produce a neat patch for
>>> the same.
>>
>> Plain text emails are preferred on this list.

Sorry about that; I have set the mail to plain text mode now.

> I think this will make metapage cache access somewhat
> similar to what we have in btree, where we use a cache to access
> the root page.  Will something like that address your concern?

Thanks. Just like _bt_getroot, I am introducing a new function
_hash_getbucketbuf_from_hashkey() which returns the target bucket
buffer for the given hashkey. It uses the cached metapage contents
instead of reading the metapage buffer whenever the cached data is
valid. There are two places which can use this service:
1. _hash_first and 2. _hash_doinsert.
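
To make the idea easier to follow at a glance, here is a small standalone
sketch (my own illustration, not code from the patch; the struct and function
names are simplified stand-ins for HashMetaPageData, hasho_prevblkno and
_hash_getbucketbuf_from_hashkey) of the two pieces the new function relies
on: the hashkey-to-bucket mapping computed from the cached masks, and the
staleness check against the maxbucket value stamped on the bucket page.
Locking and buffer management are omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the cached HashMetaPageData fields we need. */
typedef struct CachedMeta
{
    uint32_t    maxbucket;      /* cached hashm_maxbucket */
    uint32_t    highmask;       /* cached hashm_highmask */
    uint32_t    lowmask;        /* cached hashm_lowmask */
} CachedMeta;

/* Same mapping rule as _hash_hashkey2bucket(). */
static uint32_t
hashkey_to_bucket(uint32_t hashkey, const CachedMeta *meta)
{
    uint32_t    bucket = hashkey & meta->highmask;

    if (bucket > meta->maxbucket)
        bucket &= meta->lowmask;
    return bucket;
}

/*
 * A bucket page remembers the maxbucket value that was current when it was
 * last initialized or split (kept in hasho_prevblkno in the patch).  If that
 * value is larger than our cached maxbucket, the bucket was split after we
 * cached the metapage, so the cache must be refreshed and the lookup redone.
 */
static bool
cached_meta_is_stale(uint32_t page_maxbucket, const CachedMeta *meta)
{
    return page_maxbucket > meta->maxbucket;
}

int
main(void)
{
    CachedMeta  meta = {3, 7, 3};   /* 4 buckets: maxbucket 3, masks 7 and 3 */

    printf("hashkey 10 -> bucket %u\n", hashkey_to_bucket(10, &meta));
    printf("page stamped with maxbucket 5 -> stale? %d\n",
           cached_meta_is_stale(5, &meta));
    return 0;
}

In the patch, the refresh step re-reads the metapage into rel->rd_amcache and
loops back to recompute the bucket, which is what the for (;;) loop in
_hash_getbucketbuf_from_hashkey() below does.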

-- 
Thanks and Regards
Mithun C Y
EnterpriseDB: http://www.enterprisedb.com
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index 46df589..c45b3f0 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -32,19 +32,13 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 	Buffer		bucket_buf;
 	Buffer		metabuf;
 	HashMetaPage metap;
-	BlockNumber blkno;
-	BlockNumber oldblkno;
-	bool		retry;
+	HashMetaPage usedmetap;
 	Page		metapage;
 	Page		page;
 	HashPageOpaque pageopaque;
 	Size		itemsz;
 	bool		do_expand;
 	uint32		hashkey;
-	Bucket		bucket;
-	uint32		maxbucket;
-	uint32		highmask;
-	uint32		lowmask;
 
 	/*
 	 * Get the hash key for the item (it's stored in the index tuple itself).
@@ -57,10 +51,15 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 								 * need to be consistent */
 
 restart_insert:
-	/* Read the metapage */
-	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+
+	/*
+	 * Load the metapage. No lock is needed yet, because the only page header
+	 * field we access is pd_pagesize_version in HashMaxItemSize(), which is
+	 * constant and cannot move. But we hold the pin so that we can reuse
+	 * metabuf when writing to the metapage below.
+	 */
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
 	metapage = BufferGetPage(metabuf);
-	metap = HashPageGetMeta(metapage);
 
 	/*
 	 * Check whether the item can fit on a hash page at all. (Eventually, we
@@ -76,59 +75,7 @@ restart_insert:
 						itemsz, HashMaxItemSize(metapage)),
 			errhint("Values larger than a buffer page cannot be indexed.")));
 
-	oldblkno = InvalidBlockNumber;
-	retry = false;
-
-	/*
-	 * Loop until we get a lock on the correct target bucket.
-	 */
-	for (;;)
-	{
-		/*
-		 * Compute the target bucket number, and convert to block number.
-		 */
-		bucket = _hash_hashkey2bucket(hashkey,
-									  metap->hashm_maxbucket,
-									  metap->hashm_highmask,
-									  metap->hashm_lowmask);
-
-		blkno = BUCKET_TO_BLKNO(metap, bucket);
-
-		/*
-		 * Copy bucket mapping info now; refer the comment in
-		 * _hash_expandtable where we copy this information before calling
-		 * _hash_splitbucket to see why this is okay.
-		 */
-		maxbucket = metap->hashm_maxbucket;
-		highmask = metap->hashm_highmask;
-		lowmask = metap->hashm_lowmask;
-
-		/* Release metapage lock, but keep pin. */
-		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
-		/*
-		 * If the previous iteration of this loop locked the primary page of
-		 * what is still the correct target bucket, we are done.  Otherwise,
-		 * drop any old lock before acquiring the new one.
-		 */
-		if (retry)
-		{
-			if (oldblkno == blkno)
-				break;
-			_hash_relbuf(rel, buf);
-		}
-
-		/* Fetch and lock the primary bucket page for the target bucket */
-		buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
-
-		/*
-		 * Reacquire metapage lock and check that no bucket split has taken
-		 * place while we were awaiting the bucket lock.
-		 */
-		LockBuffer(metabuf, BUFFER_LOCK_SHARE);
-		oldblkno = blkno;
-		retry = true;
-	}
+	buf = _hash_getbucketbuf_from_hashkey(hashkey, rel, HASH_WRITE, &usedmetap);
 
 	/* remember the primary bucket buffer to release the pin on it at end. */
 	bucket_buf = buf;
@@ -152,7 +99,9 @@ restart_insert:
 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 
 		_hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
-						   maxbucket, highmask, lowmask);
+						   usedmetap->hashm_maxbucket,
+						   usedmetap->hashm_highmask,
+						   usedmetap->hashm_lowmask);
 
 		/* release the pin on old and meta buffer.  retry for insert. */
 		_hash_dropbuf(rel, buf);
@@ -225,6 +174,7 @@ restart_insert:
 	 */
 	LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
 
+	metap = HashPageGetMeta(metapage);
 	metap->hashm_ntuples += 1;
 
 	/* Make sure this stays in sync with _hash_expandtable() */
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 45e184c..c726909 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -434,7 +434,13 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
 		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
 		pg = BufferGetPage(buf);
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
-		pageopaque->hasho_prevblkno = InvalidBlockNumber;
+
+		/*
+		 * Set hasho_prevblkno of the bucket page to the current maxbucket
+		 * number, to mark the bucket as initialized; a reader whose cached
+		 * metapage is older than this must rebuild its cache.
+		 */
+		pageopaque->hasho_prevblkno = metap->hashm_maxbucket;
 		pageopaque->hasho_nextblkno = InvalidBlockNumber;
 		pageopaque->hasho_bucket = i;
 		pageopaque->hasho_flag = LH_BUCKET_PAGE;
@@ -845,6 +851,12 @@ _hash_splitbucket(Relation rel,
 	 */
 	oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
 
+	/*
+	 * Set hasho_prevblkno of the old bucket page to the latest maxbucket
+	 * number to indicate that the bucket has been split; a stale metapage
+	 * cache must then be rebuilt.  The same is done for the new bucket below.
+	 */
+	oopaque->hasho_prevblkno = maxbucket;
 	npage = BufferGetPage(nbuf);
 
 	/*
@@ -852,7 +864,7 @@ _hash_splitbucket(Relation rel,
 	 * split is in progress.
 	 */
 	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
-	nopaque->hasho_prevblkno = InvalidBlockNumber;
+	nopaque->hasho_prevblkno = maxbucket;
 	nopaque->hasho_nextblkno = InvalidBlockNumber;
 	nopaque->hasho_bucket = nbucket;
 	nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
@@ -1191,3 +1203,124 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
 	LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
 	hash_destroy(tidhtab);
 }
+
+
+/*
+ *	_hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
+ *										 hashkey.
+ *
+ *	Bucket pages do not move or get removed once they are allocated. This
+ *	gives us an opportunity to use the previously saved metapage contents to
+ *	reach the target bucket buffer, instead of reading it from the metapage
+ *	buffer every time. This saves one buffer access for every lookup of the
+ *	target bucket buffer, which is a very helpful saving in bufmgr traffic
+ *	and contention.
+ *
+ *	The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
+ *	bucket buffer has to be locked for reading or writing.
+ *
+ *	The out parameter cachedmetap is set to the metapage contents used for
+ *	the hashkey-to-bucket-buffer mapping. Some callers need this info to
+ *	reach the old bucket in case of a bucket split; see _hash_doinsert().
+ */
+Buffer
+_hash_getbucketbuf_from_hashkey(uint32 hashkey, Relation rel, int access,
+								HashMetaPage *cachedmetap)
+{
+	HashMetaPage metap;
+	Buffer		buf;
+	Buffer		metabuf = InvalidBuffer;
+	Page		page;
+	Bucket		bucket;
+	BlockNumber blkno;
+	HashPageOpaque opaque;
+
+	if (rel->rd_amcache != NULL)
+	{
+		metap = (HashMetaPage) rel->rd_amcache;
+	}
+	else
+	{
+		/* Read the metapage */
+		metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+		page = BufferGetPage(metabuf);
+		metap = HashPageGetMeta(page);
+
+		/* Cache the metapage data for next time. */
+		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+											 sizeof(HashMetaPageData));
+		memcpy(rel->rd_amcache, metap, sizeof(HashMetaPageData));
+		metap = (HashMetaPage) rel->rd_amcache;
+
+		/* Release metapage lock, but keep pin. */
+		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+	}
+
+	/*
+	 * Loop until we get a lock on the correct target bucket.
+	 */
+	for (;;)
+	{
+		/*
+		 * Compute the target bucket number, and convert to block number.
+		 */
+		bucket = _hash_hashkey2bucket(hashkey,
+									  metap->hashm_maxbucket,
+									  metap->hashm_highmask,
+									  metap->hashm_lowmask);
+
+		blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+		/* Fetch the primary bucket page for the bucket */
+		buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
+		page = BufferGetPage(buf);
+		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+		Assert(opaque->hasho_bucket == bucket);
+
+		/*
+		 * Check whether this bucket has been split after we cached the hash
+		 * metadata. A bucket page stores, in hasho_prevblkno, the
+		 * hashm_maxbucket value that was current when the page was last
+		 * initialized or split; if that value is larger than our cached
+		 * hashm_maxbucket, the cache is stale and must be rebuilt. For an
+		 * upgraded index, hasho_prevblkno of an old bucket page will still
+		 * contain InvalidBlockNumber, and since the maximum value
+		 * hashm_maxbucket can take is 1 less than InvalidBlockNumber (see
+		 * _hash_expandtable), an explicit check for InvalidBlockNumber is
+		 * needed to avoid treating such a bucket as split after caching.
+		 */
+		if (opaque->hasho_prevblkno == InvalidBlockNumber ||
+			opaque->hasho_prevblkno <= metap->hashm_maxbucket)
+		{
+			/* OK, we have the right bucket; proceed to search in it. */
+			break;
+		}
+
+		/* First drop any locks held on bucket buffers. */
+		_hash_relbuf(rel, buf);
+
+		/* The cached metadata is stale; refresh it and retry. */
+		if (BufferIsInvalid(metabuf))
+			metabuf =
+				_hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+		else
+			LockBuffer(metabuf, BUFFER_LOCK_SHARE);
+
+		metap = HashPageGetMeta(BufferGetPage(metabuf));
+
+		/* Cache the metapage data for next time. */
+		memcpy(rel->rd_amcache, metap, sizeof(HashMetaPageData));
+		metap = (HashMetaPage) rel->rd_amcache;
+
+		/* Release metapage lock, but keep pin. */
+		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+	}
+
+	/* done with the metapage */
+	if (!BufferIsInvalid(metabuf))
+		_hash_dropbuf(rel, metabuf);
+
+	if (cachedmetap)
+		*cachedmetap = metap;
+	return buf;
+}
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 913b87c..cdc54e1 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -152,6 +152,11 @@ _hash_readprev(IndexScanDesc scan,
 		_hash_relbuf(rel, *bufp);
 
 	*bufp = InvalidBuffer;
+
+	/* If it is a bucket page, hasho_prevblkno is not a real block number. */
+	if ((*opaquep)->hasho_flag & LH_BUCKET_PAGE)
+		return;
+
 	/* check for interrupts while we're not holding any buffer lock */
 	CHECK_FOR_INTERRUPTS();
 	if (BlockNumberIsValid(blkno))
@@ -216,13 +221,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 	uint32		hashkey;
 	Bucket		bucket;
 	BlockNumber blkno;
-	BlockNumber oldblkno = InvalidBuffer;
-	bool		retry = false;
 	Buffer		buf;
-	Buffer		metabuf;
 	Page		page;
 	HashPageOpaque opaque;
-	HashMetaPage metap;
 	IndexTuple	itup;
 	ItemPointer current;
 	OffsetNumber offnum;
@@ -277,56 +278,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 
 	so->hashso_sk_hash = hashkey;
 
-	/* Read the metapage */
-	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	page = BufferGetPage(metabuf);
-	metap = HashPageGetMeta(page);
-
-	/*
-	 * Loop until we get a lock on the correct target bucket.
-	 */
-	for (;;)
-	{
-		/*
-		 * Compute the target bucket number, and convert to block number.
-		 */
-		bucket = _hash_hashkey2bucket(hashkey,
-									  metap->hashm_maxbucket,
-									  metap->hashm_highmask,
-									  metap->hashm_lowmask);
-
-		blkno = BUCKET_TO_BLKNO(metap, bucket);
-
-		/* Release metapage lock, but keep pin. */
-		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
-		/*
-		 * If the previous iteration of this loop locked what is still the
-		 * correct target bucket, we are done.  Otherwise, drop any old lock
-		 * and lock what now appears to be the correct bucket.
-		 */
-		if (retry)
-		{
-			if (oldblkno == blkno)
-				break;
-			_hash_relbuf(rel, buf);
-		}
-
-		/* Fetch the primary bucket page for the bucket */
-		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
-
-		/*
-		 * Reacquire metapage lock and check that no bucket split has taken
-		 * place while we were awaiting the bucket lock.
-		 */
-		LockBuffer(metabuf, BUFFER_LOCK_SHARE);
-		oldblkno = blkno;
-		retry = true;
-	}
-
-	/* done with the metapage */
-	_hash_dropbuf(rel, metabuf);
-
+	buf = _hash_getbucketbuf_from_hashkey(hashkey, rel, HASH_READ, NULL);
 	page = BufferGetPage(buf);
 	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 	Assert(opaque->hasho_bucket == bucket);
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index bc08f81..fe98157 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -60,6 +60,14 @@ typedef uint32 Bucket;
 
 typedef struct HashPageOpaqueData
 {
+	/*
+	 * If this is an ovfl page, this stores the previous ovfl (or bucket)
+	 * blkno. If this is a bucket page, we use it instead to store the
+	 * hashm_maxbucket value that was current when the page was initialized
+	 * or last split. This lets us detect whether the bucket has been split
+	 * after some of the metapage data was cached; see _hash_doinsert() and
+	 * _hash_first() for how it is used.
+	 */
 	BlockNumber hasho_prevblkno;	/* previous ovfl (or bucket) blkno */
 	BlockNumber hasho_nextblkno;	/* next ovfl blkno */
 	Bucket		hasho_bucket;	/* bucket number this pg belongs to */
@@ -327,6 +335,10 @@ extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
 			 int access, int flags);
 extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
 								   BlockNumber blkno, int flags);
+extern Buffer _hash_getbucketbuf_from_hashkey(uint32 hashkey,
+								Relation rel,
+								int access,
+								HashMetaPage *cachedmetap);
 extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
 extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
 				ForkNumber forkNum);