On Tue, Dec 27, 2016 at 1:36 PM, Mithun Cy <[email protected]> wrote:
Oops, the patch number should be 08; re-attaching the same patch after renaming.
--
Thanks and Regards
Mithun C Y
EnterpriseDB: http://www.enterprisedb.com
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index 46df589..c45b3f0 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -32,19 +32,13 @@ _hash_doinsert(Relation rel, IndexTuple itup)
Buffer bucket_buf;
Buffer metabuf;
HashMetaPage metap;
- BlockNumber blkno;
- BlockNumber oldblkno;
- bool retry;
+ HashMetaPage usedmetap;
Page metapage;
Page page;
HashPageOpaque pageopaque;
Size itemsz;
bool do_expand;
uint32 hashkey;
- Bucket bucket;
- uint32 maxbucket;
- uint32 highmask;
- uint32 lowmask;
/*
* Get the hash key for the item (it's stored in the index tuple itself).
@@ -57,10 +51,15 @@ _hash_doinsert(Relation rel, IndexTuple itup)
* need to be consistent */
restart_insert:
- /* Read the metapage */
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+
+ /*
+ * Load the metapage.  No need to lock it for now, because we only access
+ * the page header element pd_pagesize_version in HashMaxItemSize(); that
+ * element is constant and will not move while we access it.  But we hold
+ * the pin so we can reuse the metabuf when writing to the metapage below.
+ */
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
metapage = BufferGetPage(metabuf);
- metap = HashPageGetMeta(metapage);
/*
* Check whether the item can fit on a hash page at all. (Eventually, we
@@ -76,59 +75,7 @@ restart_insert:
itemsz, HashMaxItemSize(metapage)),
errhint("Values larger than a buffer page cannot be indexed.")));
- oldblkno = InvalidBlockNumber;
- retry = false;
-
- /*
- * Loop until we get a lock on the correct target bucket.
- */
- for (;;)
- {
- /*
- * Compute the target bucket number, and convert to block number.
- */
- bucket = _hash_hashkey2bucket(hashkey,
- metap->hashm_maxbucket,
- metap->hashm_highmask,
- metap->hashm_lowmask);
-
- blkno = BUCKET_TO_BLKNO(metap, bucket);
-
- /*
- * Copy bucket mapping info now; refer the comment in
- * _hash_expandtable where we copy this information before calling
- * _hash_splitbucket to see why this is okay.
- */
- maxbucket = metap->hashm_maxbucket;
- highmask = metap->hashm_highmask;
- lowmask = metap->hashm_lowmask;
-
- /* Release metapage lock, but keep pin. */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
- /*
- * If the previous iteration of this loop locked the primary page of
- * what is still the correct target bucket, we are done. Otherwise,
- * drop any old lock before acquiring the new one.
- */
- if (retry)
- {
- if (oldblkno == blkno)
- break;
- _hash_relbuf(rel, buf);
- }
-
- /* Fetch and lock the primary bucket page for the target bucket */
- buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
-
- /*
- * Reacquire metapage lock and check that no bucket split has taken
- * place while we were awaiting the bucket lock.
- */
- LockBuffer(metabuf, BUFFER_LOCK_SHARE);
- oldblkno = blkno;
- retry = true;
- }
+ buf = _hash_getbucketbuf_from_hashkey(hashkey, rel, HASH_WRITE, &usedmetap);
/* remember the primary bucket buffer to release the pin on it at end. */
bucket_buf = buf;
@@ -152,7 +99,9 @@ restart_insert:
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
_hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
- maxbucket, highmask, lowmask);
+ usedmetap->hashm_maxbucket,
+ usedmetap->hashm_highmask,
+ usedmetap->hashm_lowmask);
/* release the pin on old and meta buffer. retry for insert. */
_hash_dropbuf(rel, buf);
@@ -225,6 +174,7 @@ restart_insert:
*/
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+ metap = HashPageGetMeta(metapage);
metap->hashm_ntuples += 1;
/* Make sure this stays in sync with _hash_expandtable() */
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 45e184c..c726909 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -434,7 +434,13 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
pg = BufferGetPage(buf);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
- pageopaque->hasho_prevblkno = InvalidBlockNumber;
+
+ /*
+ * Set hasho_prevblkno of the bucket page to the latest maxbucket number
+ * to indicate the bucket has been initialized, so that readers can tell
+ * when an older HashMetaCache needs to be reconstructed.
+ */
+ pageopaque->hasho_prevblkno = metap->hashm_maxbucket;
pageopaque->hasho_nextblkno = InvalidBlockNumber;
pageopaque->hasho_bucket = i;
pageopaque->hasho_flag = LH_BUCKET_PAGE;
@@ -845,6 +851,12 @@ _hash_splitbucket(Relation rel,
*/
oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
+ /*
+ * Set hasho_prevblkno of the bucket page to the latest maxbucket number to
+ * indicate the bucket has been split, so readers know to reconstruct their
+ * HashMetaCache.  The same is done for the new bucket page below.
+ */
+ oopaque->hasho_prevblkno = maxbucket;
npage = BufferGetPage(nbuf);
/*
@@ -852,7 +864,7 @@ _hash_splitbucket(Relation rel,
* split is in progress.
*/
nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
- nopaque->hasho_prevblkno = InvalidBlockNumber;
+ nopaque->hasho_prevblkno = maxbucket;
nopaque->hasho_nextblkno = InvalidBlockNumber;
nopaque->hasho_bucket = nbucket;
nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
@@ -1191,3 +1203,124 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
hash_destroy(tidhtab);
}
+
+
+/*
+ * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
+ * hashkey.
+ *
+ * Bucket pages do not move or get removed once they are allocated.  This
+ * gives us an opportunity to use the previously saved metapage contents to
+ * reach the target bucket buffer, instead of reading from the metapage
+ * buffer each time.  This saves one buffer access every time we want to
+ * reach the target bucket buffer, which is a helpful saving in bufmgr
+ * traffic and contention.
+ *
+ * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
+ * bucket buffer has to be locked for reading or writing.
+ *
+ * The out parameter cachedmetap is set with the metapage contents used for
+ * the hashkey-to-bucket-buffer mapping.  Some callers need this info to
+ * reach the old bucket in case of a bucket split; see _hash_doinsert().
+ */
+Buffer
+_hash_getbucketbuf_from_hashkey(uint32 hashkey, Relation rel, int access,
+ HashMetaPage *cachedmetap)
+{
+ HashMetaPage metap;
+ Buffer buf;
+ Buffer metabuf = InvalidBuffer;
+ Page page;
+ Bucket bucket;
+ BlockNumber blkno;
+ HashPageOpaque opaque;
+
+ if (rel->rd_amcache != NULL)
+ {
+ metap = (HashMetaPage) rel->rd_amcache;
+ }
+ else
+ {
+ /* Read the metapage */
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+ page = BufferGetPage(metabuf);
+ metap = HashPageGetMeta(page);
+
+ /* Cache the metapage data for next time. */
+ rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(HashMetaPageData));
+ memcpy(rel->rd_amcache, metap, sizeof(HashMetaPageData));
+ metap = (HashMetaPage) rel->rd_amcache;
+
+ /* Release metapage lock, but keep pin. */
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ }
+
+ /*
+ * Loop until we get a lock on the correct target bucket.
+ */
+ for (;;)
+ {
+ /*
+ * Compute the target bucket number, and convert to block number.
+ */
+ bucket = _hash_hashkey2bucket(hashkey,
+ metap->hashm_maxbucket,
+ metap->hashm_highmask,
+ metap->hashm_lowmask);
+
+ blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+ /* Fetch the primary bucket page for the bucket */
+ buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
+ page = BufferGetPage(buf);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+
+ /*
+ * Check whether this bucket was split after we cached the hash
+ * metadata.  To do this we check whether the cached maxbucket number
+ * is less than the maxbucket number stored in the bucket page, which
+ * was set to the then-current maxbucket number during bucket page
+ * splits.  In case of a binary upgrade, hasho_prevblkno of an old
+ * bucket page will be set to InvalidBlockNumber.  As of now, the
+ * maximum value hashm_maxbucket can take is 1 less than
+ * InvalidBlockNumber (see _hash_expandtable), so an explicit check
+ * for InvalidBlockNumber in hasho_prevblkno tells us whether the
+ * current bucket could have been split after caching the metadata.
+ */
+ if (opaque->hasho_prevblkno == InvalidBlockNumber ||
+ opaque->hasho_prevblkno <= metap->hashm_maxbucket)
+ {
+ /* OK, now we have the right bucket; proceed to search in it. */
+ break;
+ }
+
+ /* First drop any locks held on bucket buffers. */
+ _hash_relbuf(rel, buf);
+
+ /* Cached metadata is stale; try again after updating it. */
+ if (BufferIsInvalid(metabuf))
+ metabuf =
+ _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+ else
+ LockBuffer(metabuf, BUFFER_LOCK_SHARE);
+
+ metap = HashPageGetMeta(BufferGetPage(metabuf));
+
+ /* Cache the metapage data for next time. */
+ memcpy(rel->rd_amcache, metap, sizeof(HashMetaPageData));
+ metap = (HashMetaPage) rel->rd_amcache;
+
+ /* Release metapage lock, but keep pin. */
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ }
+
+ /* done with the metapage */
+ if (!BufferIsInvalid(metabuf))
+ _hash_dropbuf(rel, metabuf);
+
+ if (cachedmetap)
+ *cachedmetap = metap;
+ return buf;
+}
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 913b87c..cdc54e1 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -152,6 +152,11 @@ _hash_readprev(IndexScanDesc scan,
_hash_relbuf(rel, *bufp);
*bufp = InvalidBuffer;
+
+ /* If it is a bucket page, hasho_prevblkno is not a real block number. */
+ if ((*opaquep)->hasho_flag & LH_BUCKET_PAGE)
+ return;
+
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
if (BlockNumberIsValid(blkno))
@@ -216,13 +221,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
uint32 hashkey;
Bucket bucket;
BlockNumber blkno;
- BlockNumber oldblkno = InvalidBuffer;
- bool retry = false;
Buffer buf;
- Buffer metabuf;
Page page;
HashPageOpaque opaque;
- HashMetaPage metap;
IndexTuple itup;
ItemPointer current;
OffsetNumber offnum;
@@ -277,56 +278,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
so->hashso_sk_hash = hashkey;
- /* Read the metapage */
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
- page = BufferGetPage(metabuf);
- metap = HashPageGetMeta(page);
-
- /*
- * Loop until we get a lock on the correct target bucket.
- */
- for (;;)
- {
- /*
- * Compute the target bucket number, and convert to block number.
- */
- bucket = _hash_hashkey2bucket(hashkey,
- metap->hashm_maxbucket,
- metap->hashm_highmask,
- metap->hashm_lowmask);
-
- blkno = BUCKET_TO_BLKNO(metap, bucket);
-
- /* Release metapage lock, but keep pin. */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
- /*
- * If the previous iteration of this loop locked what is still the
- * correct target bucket, we are done. Otherwise, drop any old lock
- * and lock what now appears to be the correct bucket.
- */
- if (retry)
- {
- if (oldblkno == blkno)
- break;
- _hash_relbuf(rel, buf);
- }
-
- /* Fetch the primary bucket page for the bucket */
- buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
-
- /*
- * Reacquire metapage lock and check that no bucket split has taken
- * place while we were awaiting the bucket lock.
- */
- LockBuffer(metabuf, BUFFER_LOCK_SHARE);
- oldblkno = blkno;
- retry = true;
- }
-
- /* done with the metapage */
- _hash_dropbuf(rel, metabuf);
-
+ buf = _hash_getbucketbuf_from_hashkey(hashkey, rel, HASH_READ, NULL);
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == bucket);
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index bc08f81..fe98157 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -60,6 +60,14 @@ typedef uint32 Bucket;
typedef struct HashPageOpaqueData
{
+ /*
+ * If this is an ovfl page, this stores the previous ovfl (or bucket)
+ * blkno.  If this is a bucket page, we use the field for a special
+ * purpose instead: we store the hashm_maxbucket value whenever the page
+ * is initialized or split.  This helps us to know whether the bucket has
+ * been split after we cached some of the metapage data.  See
+ * _hash_doinsert() and _hash_first() for how this is used.
+ */
BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */
BlockNumber hasho_nextblkno; /* next ovfl blkno */
Bucket hasho_bucket; /* bucket number this pg belongs to */
@@ -327,6 +335,10 @@ extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
int access, int flags);
extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
BlockNumber blkno, int flags);
+extern Buffer
+_hash_getbucketbuf_from_hashkey(uint32 hashkey, Relation rel,
+ int access,
+ HashMetaPage *cachedmetap);
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
ForkNumber forkNum);
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers