From 34b10212e243696d37bfc8ef55b84777b60faa69 Mon Sep 17 00:00:00 2001
From: Dilip Kumar <dilipkumar@localhost.localdomain>
Date: Sun, 26 Jul 2020 18:00:35 +0530
Subject: [PATCH v1] POC:Parallel Bitmap Index Scan

This patch enables a parallel bitmap index scan path under the parallel
heap scan path.  As of now, only one worker is allowed to do the bitmap
index scan and prepare a shared tidbitmap.  With this patch, if the
underlying index AM supports parallel scan then multiple workers will be
allowed to create their own parts of the tidbitmap, and once all the
workers are ready with their tidbitmaps, those bitmaps will be merged
into a shared bitmap.  After that the remaining part is the same as now,
i.e. one worker will create a shared iterator which will be used by all
the workers to scan the heap in parallel.
---
 src/backend/executor/execParallel.c           |  20 ++++
 src/backend/executor/nodeBitmapHeapscan.c     |  81 ++++++++++++--
 src/backend/executor/nodeBitmapIndexscan.c    | 147 +++++++++++++++++++++++---
 src/backend/nodes/tidbitmap.c                 | 110 ++++++++++++++++---
 src/backend/optimizer/path/indxpath.c         |  59 +++++++++--
 src/backend/optimizer/plan/createplan.c       |   2 +-
 src/backend/optimizer/util/pathnode.c         |   6 +-
 src/include/executor/nodeBitmapIndexscan.h    |   9 ++
 src/include/lib/simplehash.h                  |  25 +++++
 src/include/nodes/execnodes.h                 |  10 +-
 src/include/nodes/tidbitmap.h                 |   6 +-
 src/test/regress/expected/select_parallel.out |   6 +-
 12 files changed, 425 insertions(+), 56 deletions(-)

diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 382e78f..469371e 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -272,6 +272,11 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 				ExecBitmapHeapEstimate((BitmapHeapScanState *) planstate,
 									   e->pcxt);
 			break;
+		case T_BitmapIndexScanState:
+			if (planstate->plan->parallel_aware)
+				ExecBitmapIndexEstimate((BitmapIndexScanState *)planstate,
+										e->pcxt);
+			break;
 		case T_HashJoinState:
 			if (planstate->plan->parallel_aware)
 				ExecHashJoinEstimate((HashJoinState *) planstate,
@@ -492,6 +497,11 @@ ExecParallelInitializeDSM(PlanState *planstate,
 				ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate,
 											d->pcxt);
 			break;
+		case T_BitmapIndexScanState:
+			if (planstate->plan->parallel_aware)
+				ExecBitmapIndexInitializeDSM((BitmapIndexScanState *) planstate,
+											 d->pcxt);
+			break;
 		case T_HashJoinState:
 			if (planstate->plan->parallel_aware)
 				ExecHashJoinInitializeDSM((HashJoinState *) planstate,
@@ -981,6 +991,11 @@ ExecParallelReInitializeDSM(PlanState *planstate,
 				ExecBitmapHeapReInitializeDSM((BitmapHeapScanState *) planstate,
 											  pcxt);
 			break;
+		case T_BitmapIndexScanState:
+			if (planstate->plan->parallel_aware)
+				ExecBitmapIndexReInitializeDSM((BitmapIndexScanState *) planstate,
+											  pcxt);
+			break;
 		case T_HashJoinState:
 			if (planstate->plan->parallel_aware)
 				ExecHashJoinReInitializeDSM((HashJoinState *) planstate,
@@ -1328,6 +1343,11 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 				ExecBitmapHeapInitializeWorker((BitmapHeapScanState *) planstate,
 											   pwcxt);
 			break;
+		case T_BitmapIndexScanState:
+			if (planstate->plan->parallel_aware)
+				ExecBitmapIndexInitializeWorker((BitmapIndexScanState *)planstate,
+											   pwcxt);
+			break;
 		case T_HashJoinState:
 			if (planstate->plan->parallel_aware)
 				ExecHashJoinInitializeWorker((HashJoinState *) planstate,
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 5a5c410..e97f7a0 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -37,6 +37,7 @@
 
 #include <math.h>
 
+#include "access/genam.h"
 #include "access/relscan.h"
 #include "access/tableam.h"
 #include "access/transam.h"
@@ -131,29 +132,87 @@ BitmapHeapNext(BitmapHeapScanState *node)
 		else
 		{
 			/*
+			 * If the underlying node is parallel-aware then all the workers
+			 * will do the parallel index scan and prepare their own local
+			 * bitmaps, which will then be merged into a shared common
+			 * bitmap.
+			 */
+			if (outerPlanState(node)->plan->parallel_aware)
+			{
+				/* All the workers will attach to the barrier */
+				BarrierAttach(&pstate->barrier);
+
+				/*
+				 * If the shared tbm is already created before we start then
+				 * there is no point in continuing.  So just detach from the
+				 * barrier and return NULL slot.
+				 */
+				if (DsaPointerIsValid(pstate->tbm_shared))
+				{
+					BarrierDetach(&pstate->barrier);
+					return NULL;
+				}
+				else
+				{
+					tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
+					if (!tbm || !IsA(tbm, TIDBitmap))
+						elog(ERROR, "unrecognized result from subplan");
+
+					/* Merge bitmap to a common shared bitmap */
+					SpinLockAcquire(&pstate->mutex);
+					tbm_merge(tbm, &pstate->tbm_shared, &pstate->pt_shared);
+					SpinLockRelease(&pstate->mutex);
+
+					/*
+					 * Don't proceed to preparing the shared iterator until
+					 * all workers have merged their bitmaps into the shared
+					 * bitmap.
+					 */
+					BarrierArriveAndWait(&pstate->barrier, 0);
+					BarrierDetach(&pstate->barrier);
+				}
+			}
+
+			/*
 			 * The leader will immediately come out of the function, but
-			 * others will be blocked until leader populates the TBM and wakes
-			 * them up.
+			 * others will be blocked until leader initialized the shared
+			 * iterator.
 			 */
 			if (BitmapShouldInitializeSharedState(pstate))
 			{
-				tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
-				if (!tbm || !IsA(tbm, TIDBitmap))
-					elog(ERROR, "unrecognized result from subplan");
+				/*
+				 * If the underlying node is parallel-aware then we have
+				 * already generated and merged the bitmap, so only do the
+				 * shared iterator initialization.  Otherwise build the bitmap.
+				 */
+				if (outerPlanState(node)->plan->parallel_aware)
+				{
+					/* If the shared tbm is not initialized then nothing to do */
+					if (!DsaPointerIsValid(pstate->tbm_shared))
+						return NULL;
 
-				node->tbm = tbm;
+					tbm = dsa_get_address(dsa, pstate->tbm_shared);
+				}
+				else
+				{
+					tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
+					if (!tbm || !IsA(tbm, TIDBitmap))
+						elog(ERROR, "unrecognized result from subplan");
+					node->tbm = tbm;
+				}
 
 				/*
 				 * Prepare to iterate over the TBM. This will return the
 				 * dsa_pointer of the iterator state which will be used by
 				 * multiple processes to iterate jointly.
 				 */
-				pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
+				pstate->tbmiterator = tbm_prepare_shared_iterate(tbm, dsa,
+													pstate->pt_shared);
 #ifdef USE_PREFETCH
 				if (node->prefetch_maximum > 0)
 				{
-					pstate->prefetch_iterator =
-						tbm_prepare_shared_iterate(tbm);
+					pstate->prefetch_iterator = tbm_prepare_shared_iterate(tbm,
+													dsa, pstate->pt_shared);
 
 					/*
 					 * We don't need the mutex here as we haven't yet woke up
@@ -896,6 +955,7 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
 	pstate->state = BM_INITIAL;
 
 	ConditionVariableInit(&pstate->cv);
+	BarrierInit(&pstate->barrier, 0);
 	SerializeSnapshot(estate->es_snapshot, pstate->phs_snapshot_data);
 
 	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate);
@@ -920,6 +980,9 @@ ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node,
 		return;
 
 	pstate->state = BM_INITIAL;
+	pstate->tbm_shared = InvalidDsaPointer;
+	pstate->pt_shared = InvalidDsaPointer;
+	BarrierInit(&pstate->barrier, 0);
 
 	if (DsaPointerIsValid(pstate->tbmiterator))
 		tbm_free_shared_area(dsa, pstate->tbmiterator);
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c
index 81a1208..4a88a6f 100644
--- a/src/backend/executor/nodeBitmapIndexscan.c
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@@ -62,6 +62,22 @@ MultiExecBitmapIndexScan(BitmapIndexScanState *node)
 	 * extract necessary information from index scan node
 	 */
 	scandesc = node->biss_ScanDesc;
+	if (scandesc == NULL)
+	{
+		scandesc = node->biss_ScanDesc =
+			index_beginscan_bitmap(node->biss_RelationDesc,
+								   node->ss.ps.state->es_snapshot,
+								   node->biss_NumScanKeys);
+		/*
+		 * If no run-time keys to calculate, go ahead and pass the scankeys to the
+		 * index AM.
+		 */
+		if (node->biss_NumRuntimeKeys == 0 &&
+			node->biss_NumArrayKeys == 0)
+			index_rescan(node->biss_ScanDesc,
+						 node->biss_ScanKeys, node->biss_NumScanKeys,
+						 NULL, 0);
+	}
 
 	/*
 	 * If we have runtime keys and they've not already been set up, do it now.
@@ -162,7 +178,7 @@ ExecReScanBitmapIndexScan(BitmapIndexScanState *node)
 		node->biss_RuntimeKeysReady = true;
 
 	/* reset index scan */
-	if (node->biss_RuntimeKeysReady)
+	if (node->biss_ScanDesc && node->biss_RuntimeKeysReady)
 		index_rescan(node->biss_ScanDesc,
 					 node->biss_ScanKeys, node->biss_NumScanKeys,
 					 NULL, 0);
@@ -232,8 +248,8 @@ ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags)
 	 * ancestor BitmapHeapScan node is holding AccessShareLock (or better) on
 	 * the heap relation throughout the execution of the plan tree.
 	 */
-
-	indexstate->ss.ss_currentRelation = NULL;
+	indexstate->ss.ss_currentRelation =
+		ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
 	indexstate->ss.ss_currentScanDesc = NULL;
 
 	/*
@@ -308,23 +324,124 @@ ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags)
 	/*
 	 * Initialize scan descriptor.
 	 */
-	indexstate->biss_ScanDesc =
-		index_beginscan_bitmap(indexstate->biss_RelationDesc,
-							   estate->es_snapshot,
-							   indexstate->biss_NumScanKeys);
+	if (!node->scan.plan.parallel_aware)
+	{
+		indexstate->biss_ScanDesc =
+			index_beginscan_bitmap(indexstate->biss_RelationDesc,
+								   estate->es_snapshot,
+								   indexstate->biss_NumScanKeys);
+
+		/*
+		 * If no run-time keys to calculate, go ahead and pass the scankeys to
+		 * the index AM.
+		 */
+		if (indexstate->biss_NumRuntimeKeys == 0 &&
+			indexstate->biss_NumArrayKeys == 0)
+			index_rescan(indexstate->biss_ScanDesc,
+						 indexstate->biss_ScanKeys,
+						 indexstate->biss_NumScanKeys,
+						 NULL, 0);
+	}
 
 	/*
-	 * If no run-time keys to calculate, go ahead and pass the scankeys to the
-	 * index AM.
+	 * all done.
 	 */
-	if (indexstate->biss_NumRuntimeKeys == 0 &&
-		indexstate->biss_NumArrayKeys == 0)
-		index_rescan(indexstate->biss_ScanDesc,
-					 indexstate->biss_ScanKeys, indexstate->biss_NumScanKeys,
+	return indexstate;
+}
+
+/* ----------------------------------------------------------------
+ *		ExecBitmapIndexEstimate
+ *
+ *		Compute the amount of space we'll need in the parallel
+ *		query DSM, and inform pcxt->estimator about our needs.
+ * ----------------------------------------------------------------
+ */
+void ExecBitmapIndexEstimate(BitmapIndexScanState *node,
+							 ParallelContext *pcxt)
+{
+	EState *estate = node->ss.ps.state;
+
+	node->biss_PscanLen = index_parallelscan_estimate(node->biss_RelationDesc,
+													  estate->es_snapshot);
+	shm_toc_estimate_chunk(&pcxt->estimator, node->biss_PscanLen);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecBitmapIndexInitializeDSM
+ *
+ *		Set up a parallel index scan descriptor.
+ * ----------------------------------------------------------------
+ */
+void
+ExecBitmapIndexInitializeDSM(BitmapIndexScanState *node,
+								  ParallelContext *pcxt)
+{
+	EState *estate = node->ss.ps.state;
+	ParallelIndexScanDesc piscan;
+
+	piscan = shm_toc_allocate(pcxt->toc, node->biss_PscanLen);
+	index_parallelscan_initialize(node->ss.ss_currentRelation,
+								  node->biss_RelationDesc,
+								  estate->es_snapshot,
+								  piscan);
+	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan);
+	node->biss_ScanDesc =
+		index_beginscan_parallel(node->ss.ss_currentRelation,
+								 node->biss_RelationDesc,
+								 node->biss_NumScanKeys,
+								 0,
+								 piscan);
+
+	/*
+	 * If no run-time keys to calculate or they are ready, go ahead and pass
+	 * the scankeys to the index AM.
+	 */
+	if (node->biss_NumRuntimeKeys == 0 || node->biss_RuntimeKeysReady)
+		index_rescan(node->biss_ScanDesc,
+					 node->biss_ScanKeys, node->biss_NumScanKeys,
 					 NULL, 0);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecBitmapIndexInitializeWorker
+ *
+ *		Copy relevant information from TOC into planstate.
+ * ----------------------------------------------------------------
+ */
+void
+ExecBitmapIndexInitializeWorker(BitmapIndexScanState *node,
+									 ParallelWorkerContext *pwcxt)
+{
+	ParallelIndexScanDesc piscan;
+
+	piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
+	node->biss_ScanDesc =
+		index_beginscan_parallel(node->ss.ss_currentRelation,
+								 node->biss_RelationDesc,
+								 node->biss_NumScanKeys,
+								 0,
+								 piscan);
 
 	/*
-	 * all done.
+	 * If no run-time keys to calculate or they are ready, go ahead and pass
+	 * the scankeys to the index AM.
 	 */
-	return indexstate;
+	if (node->biss_NumRuntimeKeys == 0 || node->biss_RuntimeKeysReady)
+		index_rescan(node->biss_ScanDesc,
+					 node->biss_ScanKeys, node->biss_NumScanKeys,
+					 NULL, 0);
 }
+
+/* ----------------------------------------------------------------
+ *		ExecBitmapIndexReInitializeDSM
+ *
+ *		Reset shared state before beginning a fresh scan.
+ * ----------------------------------------------------------------
+ */
+void
+ExecBitmapIndexReInitializeDSM(BitmapIndexScanState *node,
+							   ParallelContext *pcxt)
+{
+	index_parallelrescan(node->biss_ScanDesc);
+}
\ No newline at end of file
diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c
index 0d5056c..c3a9ab3 100644
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -753,6 +753,78 @@ tbm_begin_iterate(TIDBitmap *tbm)
 }
 
 /*
+ * tbm_merge - merge the worker's tbm into the shared tbm
+ *
+ * First worker will allocate the memory for the shared tbm and shared
+ * pagetable and copy.  The subsequent workers will merge their tbm to the
+ * shared tbm.
+ */
+void
+tbm_merge(TIDBitmap *tbm, dsa_pointer *dp_tbm, dsa_pointer *dp_pagetable)
+{
+	TIDBitmap	   *stbm;
+	pagetable_hash	*spagetable;
+
+	/* If the tbm is empty then nothing to do */
+	if (tbm_is_empty(tbm))
+		return;
+
+	/*
+	 * If we haven't yet created a shared tbm then allocate the memory for
+	 * the tbm and pagetable hash in DSA so that the subsequent workers can
+	 * merge their TBM to this shared TBM.
+	 */
+	if (!DsaPointerIsValid(*dp_tbm))
+	{
+		*dp_tbm = dsa_allocate0(tbm->dsa, sizeof(TIDBitmap));
+		stbm = dsa_get_address(tbm->dsa, *dp_tbm);
+		stbm->dsa = tbm->dsa;
+
+		/* Directly copy TBM to the shared TBM */
+		memcpy(stbm, tbm, sizeof(TIDBitmap));
+		
+		*dp_pagetable = dsa_allocate0(tbm->dsa, pagetable_size());
+		spagetable = dsa_get_address(tbm->dsa, *dp_pagetable);
+
+		/* If the tbm is in one page mode then convert into the shared hash */
+		if (tbm->status == TBM_ONE_PAGE)
+			tbm_create_pagetable(tbm);
+
+		/* Copy pagetable hash to the shared memory */
+		memcpy(spagetable, tbm->pagetable, pagetable_size());
+
+		/*
+		 * We have created a shared tbm and pagetable so free its memory.  We
+		 * can not directly call the tbm_free here otherwise it will free the
+		 * underlying page table data which is already in shared memory.
+		 */
+		pfree(tbm->pagetable);
+		pfree(tbm);
+	}
+	else
+	{
+		PTEntryArray *entry;
+
+		/* Get the shared TBM and pagetable hash */
+		stbm = dsa_get_address(tbm->dsa, *dp_tbm);
+		stbm->dsa = tbm->dsa;
+		spagetable = dsa_get_address(tbm->dsa, *dp_pagetable);
+		stbm->pagetable = spagetable;
+
+		/*
+		 * Get the shared pagetable data address and set its pointer in the
+		 * shared pagetable.
+		 */
+		entry = dsa_get_address(tbm->dsa, stbm->dsapagetable);
+		pagetable_set_data(spagetable, entry->ptentry, (void *) stbm);
+
+		/* Merge our TBM to the shared TBM and release its memory */
+		tbm_union(stbm, tbm);
+		tbm_free(tbm);
+	}
+}
+
+/*
  * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
  *
  * The necessary shared state will be allocated from the DSA passed to
@@ -762,7 +834,8 @@ tbm_begin_iterate(TIDBitmap *tbm)
  * into pagetable array.
  */
 dsa_pointer
-tbm_prepare_shared_iterate(TIDBitmap *tbm)
+tbm_prepare_shared_iterate(TIDBitmap *tbm, dsa_area *dsa,
+						   dsa_pointer dp_pagetable)
 {
 	dsa_pointer dp;
 	TBMSharedIteratorState *istate;
@@ -770,15 +843,14 @@ tbm_prepare_shared_iterate(TIDBitmap *tbm)
 	PTIterationArray *ptpages = NULL;
 	PTIterationArray *ptchunks = NULL;
 
-	Assert(tbm->dsa != NULL);
 	Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
 
 	/*
 	 * Allocate TBMSharedIteratorState from DSA to hold the shared members and
 	 * lock, this will also be used by multiple worker for shared iterate.
 	 */
-	dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
-	istate = dsa_get_address(tbm->dsa, dp);
+	dp = dsa_allocate0(dsa, sizeof(TBMSharedIteratorState));
+	istate = dsa_get_address(dsa, dp);
 
 	/*
 	 * If we're not already iterating, create and fill the sorted page lists.
@@ -799,16 +871,16 @@ tbm_prepare_shared_iterate(TIDBitmap *tbm)
 		 */
 		if (tbm->npages)
 		{
-			tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
+			tbm->ptpages = dsa_allocate(dsa, sizeof(PTIterationArray) +
 										tbm->npages * sizeof(int));
-			ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
+			ptpages = dsa_get_address(dsa, tbm->ptpages);
 			pg_atomic_init_u32(&ptpages->refcount, 0);
 		}
 		if (tbm->nchunks)
 		{
-			tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
+			tbm->ptchunks = dsa_allocate(dsa, sizeof(PTIterationArray) +
 										 tbm->nchunks * sizeof(int));
-			ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
+			ptchunks = dsa_get_address(dsa, tbm->ptchunks);
 			pg_atomic_init_u32(&ptchunks->refcount, 0);
 		}
 
@@ -821,8 +893,18 @@ tbm_prepare_shared_iterate(TIDBitmap *tbm)
 		npages = nchunks = 0;
 		if (tbm->status == TBM_HASH)
 		{
-			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
+			ptbase = dsa_get_address(dsa, tbm->dsapagetable);
 
+			/*
+			 * If shared page table is valid then set it in the shared tbm
+			 * and also set the shared data to the shared pagetable.
+			 */
+			if (DsaPointerIsValid(dp_pagetable))
+			{
+				tbm->pagetable = dsa_get_address(dsa, dp_pagetable);
+				pagetable_set_data(tbm->pagetable, ptbase->ptentry, NULL);
+			}
+			
 			pagetable_start_iterate(tbm->pagetable, &i);
 			while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
 			{
@@ -843,9 +925,9 @@ tbm_prepare_shared_iterate(TIDBitmap *tbm)
 			 * initialize it, and directly store its index (i.e. 0) in the
 			 * page array.
 			 */
-			tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
+			tbm->dsapagetable = dsa_allocate(dsa, sizeof(PTEntryArray) +
 											 sizeof(PagetableEntry));
-			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
+			ptbase = dsa_get_address(dsa, tbm->dsapagetable);
 			memcpy(ptbase->ptentry, &tbm->entry1, sizeof(PagetableEntry));
 			ptpages->index[0] = 0;
 		}
@@ -872,9 +954,9 @@ tbm_prepare_shared_iterate(TIDBitmap *tbm)
 	istate->spages = tbm->ptpages;
 	istate->schunks = tbm->ptchunks;
 
-	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
-	ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
-	ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
+	ptbase = dsa_get_address(dsa, tbm->dsapagetable);
+	ptpages = dsa_get_address(dsa, tbm->ptpages);
+	ptchunks = dsa_get_address(dsa, tbm->ptchunks);
 
 	/*
 	 * For every shared iterator, referring to pagetable and iterator array,
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index bcb1bc6..2d55a8c 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -102,13 +102,14 @@ static bool eclass_already_used(EquivalenceClass *parent_ec, Relids oldrelids,
 static bool bms_equal_any(Relids relids, List *relids_list);
 static void get_index_paths(PlannerInfo *root, RelOptInfo *rel,
 							IndexOptInfo *index, IndexClauseSet *clauses,
-							List **bitindexpaths);
+							List **bitindexpaths, List **partialbitmapipaths);
 static List *build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 							   IndexOptInfo *index, IndexClauseSet *clauses,
 							   bool useful_predicate,
 							   ScanTypeControl scantype,
 							   bool *skip_nonnative_saop,
-							   bool *skip_lower_saop);
+							   bool *skip_lower_saop,
+							   List **partial_ipath);
 static List *build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel,
 								List *clauses, List *other_clauses);
 static List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel,
@@ -232,6 +233,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel)
 {
 	List	   *indexpaths;
 	List	   *bitindexpaths;
+	List	   *partialbitindexpaths = NULL;
 	List	   *bitjoinpaths;
 	List	   *joinorclauses;
 	IndexClauseSet rclauseset;
@@ -274,7 +276,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel)
 		 * bitmap paths are added to bitindexpaths to be handled below.
 		 */
 		get_index_paths(root, rel, index, &rclauseset,
-						&bitindexpaths);
+						&bitindexpaths, &partialbitindexpaths);
 
 		/*
 		 * Identify the join clauses that can match the index.  For the moment
@@ -339,7 +341,22 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel)
 										rel->lateral_relids, 1.0, 0);
 		add_path(rel, (Path *) bpath);
 
-		/* create a partial bitmap heap path */
+		/* Create a partial bitmap heap path */
+		if (rel->consider_parallel && rel->lateral_relids == NULL)
+			create_partial_bitmap_paths(root, rel, bitmapqual);
+	}
+	/*
+	 * Create partial bitmap heap path with partial bitmap index path
+	 * underneath.
+	 * TODO:  We can consider the partial path for Bitmap Or and Bitmap And
+	 * as well.
+	 */
+	if (partialbitindexpaths != NIL)
+	{
+		Path *bitmapqual;
+
+		bitmapqual = choose_bitmap_and(root, rel, partialbitindexpaths);
+
 		if (rel->consider_parallel && rel->lateral_relids == NULL)
 			create_partial_bitmap_paths(root, rel, bitmapqual);
 	}
@@ -659,7 +676,7 @@ get_join_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	Assert(clauseset.nonempty);
 
 	/* Build index path(s) using the collected set of clauses */
-	get_index_paths(root, rel, index, &clauseset, bitindexpaths);
+	get_index_paths(root, rel, index, &clauseset, bitindexpaths, NULL);
 
 	/*
 	 * Remember we considered paths for this set of relids.
@@ -715,7 +732,8 @@ bms_equal_any(Relids relids, List *relids_list)
  *	  Given an index and a set of index clauses for it, construct IndexPaths.
  *
  * Plain indexpaths are sent directly to add_path, while potential
- * bitmap indexpaths are added to *bitindexpaths for later processing.
+ * bitmap indexpaths and partial bitmap indexpaths are added to *bitindexpaths
+ * and partialbitmapipaths respectively for later processing.
  *
  * This is a fairly simple frontend to build_index_paths().  Its reason for
  * existence is mainly to handle ScalarArrayOpExpr quals properly.  If the
@@ -728,9 +746,10 @@ bms_equal_any(Relids relids, List *relids_list)
 static void
 get_index_paths(PlannerInfo *root, RelOptInfo *rel,
 				IndexOptInfo *index, IndexClauseSet *clauses,
-				List **bitindexpaths)
+				List **bitindexpaths, List **partialbitmapipaths)
 {
 	List	   *indexpaths;
+	List	   *partialindexpaths = NULL;
 	bool		skip_nonnative_saop = false;
 	bool		skip_lower_saop = false;
 	ListCell   *lc;
@@ -746,7 +765,8 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
 								   index->predOK,
 								   ST_ANYSCAN,
 								   &skip_nonnative_saop,
-								   &skip_lower_saop);
+								   &skip_lower_saop,
+								   &partialindexpaths);
 
 	/*
 	 * If we skipped any lower-order ScalarArrayOpExprs on an index with an AM
@@ -761,7 +781,8 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
 												   index->predOK,
 												   ST_ANYSCAN,
 												   &skip_nonnative_saop,
-												   NULL));
+												   NULL,
+												   &partialindexpaths));
 	}
 
 	/*
@@ -788,6 +809,15 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
 			 ipath->indexselectivity < 1.0))
 			*bitindexpaths = lappend(*bitindexpaths, ipath);
 	}
+	foreach (lc, partialindexpaths)
+	{
+		IndexPath *ipath = (IndexPath *) lfirst(lc);
+
+		if (partialbitmapipaths && index->amhasgetbitmap &&
+			(ipath->path.pathkeys == NIL ||
+			 ipath->indexselectivity < 1.0))
+			*partialbitmapipaths = lappend(*partialbitmapipaths, ipath);
+	}
 
 	/*
 	 * If there were ScalarArrayOpExpr clauses that the index can't handle
@@ -801,6 +831,7 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
 									   false,
 									   ST_BITMAPSCAN,
 									   NULL,
+									   NULL,
 									   NULL);
 		*bitindexpaths = list_concat(*bitindexpaths, indexpaths);
 	}
@@ -853,7 +884,8 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 				  bool useful_predicate,
 				  ScanTypeControl scantype,
 				  bool *skip_nonnative_saop,
-				  bool *skip_lower_saop)
+				  bool *skip_lower_saop,
+				  List **partial_ipath)
 {
 	List	   *result = NIL;
 	IndexPath  *ipath;
@@ -1066,7 +1098,10 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 			 * parallel workers, just free it.
 			 */
 			if (ipath->path.parallel_workers > 0)
+			{
 				add_partial_path(rel, (Path *) ipath);
+				*partial_ipath = lappend(*partial_ipath, ipath);
+			}
 			else
 				pfree(ipath);
 		}
@@ -1116,7 +1151,10 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 				 * using parallel workers, just free it.
 				 */
 				if (ipath->path.parallel_workers > 0)
+				{
 					add_partial_path(rel, (Path *) ipath);
+					*partial_ipath = lappend(*partial_ipath, ipath);
+				}
 				else
 					pfree(ipath);
 			}
@@ -1230,6 +1268,7 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel,
 									   useful_predicate,
 									   ST_BITMAPSCAN,
 									   NULL,
+									   NULL,
 									   NULL);
 		result = list_concat(result, indexpaths);
 	}
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 99278ee..c38b955 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -3295,7 +3295,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
 		plan->plan_rows =
 			clamp_row_est(ipath->indexselectivity * ipath->path.parent->tuples);
 		plan->plan_width = 0;	/* meaningless */
-		plan->parallel_aware = false;
+		plan->parallel_aware = ipath->path.parallel_aware;
 		plan->parallel_safe = ipath->path.parallel_safe;
 		/* Extract original index clauses, actual index quals, relevant ECs */
 		subquals = NIL;
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 5110a6b..0342aaa 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -822,7 +822,8 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path)
 		{
 			parent_rel->partial_pathlist =
 				foreach_delete_current(parent_rel->partial_pathlist, p1);
-			pfree(old_path);
+			if (!IsA(old_path, IndexPath))
+				pfree(old_path);
 		}
 		else
 		{
@@ -849,7 +850,8 @@ add_partial_path(RelOptInfo *parent_rel, Path *new_path)
 	else
 	{
 		/* Reject and recycle the new path */
-		pfree(new_path);
+		if (!IsA(new_path, IndexPath))
+			pfree(new_path);
 	}
 }
 
diff --git a/src/include/executor/nodeBitmapIndexscan.h b/src/include/executor/nodeBitmapIndexscan.h
index 42a24e6..a3674c5 100644
--- a/src/include/executor/nodeBitmapIndexscan.h
+++ b/src/include/executor/nodeBitmapIndexscan.h
@@ -14,11 +14,20 @@
 #ifndef NODEBITMAPINDEXSCAN_H
 #define NODEBITMAPINDEXSCAN_H
 
+#include "access/parallel.h"
 #include "nodes/execnodes.h"
 
 extern BitmapIndexScanState *ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags);
 extern Node *MultiExecBitmapIndexScan(BitmapIndexScanState *node);
 extern void ExecEndBitmapIndexScan(BitmapIndexScanState *node);
 extern void ExecReScanBitmapIndexScan(BitmapIndexScanState *node);
+extern void ExecBitmapIndexEstimate(BitmapIndexScanState *node,
+									ParallelContext *pcxt);
+extern void ExecBitmapIndexInitializeDSM(BitmapIndexScanState *node,
+										 ParallelContext *pcxt);
+extern void ExecBitmapIndexInitializeWorker(BitmapIndexScanState *node,
+											ParallelWorkerContext *pwcxt);
+extern void ExecBitmapIndexReInitializeDSM(BitmapIndexScanState *node,
+										   ParallelContext *pcxt);
 
 #endif							/* NODEBITMAPINDEXSCAN_H */
diff --git a/src/include/lib/simplehash.h b/src/include/lib/simplehash.h
index 90dfa8a..53ac5d5 100644
--- a/src/include/lib/simplehash.h
+++ b/src/include/lib/simplehash.h
@@ -76,6 +76,9 @@
 /* function declarations */
 #define SH_CREATE SH_MAKE_NAME(create)
 #define SH_DESTROY SH_MAKE_NAME(destroy)
+#define	SH_COPY	SH_MAKE_NAME(copy)
+#define	SH_SIZE	SH_MAKE_NAME(size)
+#define	SH_SET_DATA SH_MAKE_NAME(set_data)
 #define SH_RESET SH_MAKE_NAME(reset)
 #define SH_INSERT SH_MAKE_NAME(insert)
 #define SH_INSERT_HASH SH_MAKE_NAME(insert_hash)
@@ -155,6 +158,9 @@ SH_SCOPE	SH_TYPE *SH_CREATE(MemoryContext ctx, uint32 nelements,
 							   void *private_data);
 #endif
 SH_SCOPE void SH_DESTROY(SH_TYPE * tb);
+SH_SCOPE void SH_COPY(SH_TYPE *src, SH_TYPE *dst);
+SH_SCOPE int SH_SIZE(void);
+SH_SCOPE void SH_SET_DATA(SH_TYPE *src, SH_ELEMENT_TYPE *data, void *private);
 SH_SCOPE void SH_RESET(SH_TYPE * tb);
 SH_SCOPE void SH_GROW(SH_TYPE * tb, uint32 newsize);
 SH_SCOPE	SH_ELEMENT_TYPE *SH_INSERT(SH_TYPE * tb, SH_KEY_TYPE key, bool *found);
@@ -344,6 +350,25 @@ SH_FREE(SH_TYPE * type, void *pointer)
 
 #endif
 
+SH_SCOPE void
+SH_COPY(SH_TYPE *src, SH_TYPE *dst)
+{
+	memcpy(dst, src, sizeof(SH_TYPE));
+}
+
+SH_SCOPE int
+SH_SIZE()
+{
+	return sizeof(SH_TYPE);
+}
+
+SH_SCOPE void
+SH_SET_DATA(SH_TYPE *src, SH_ELEMENT_TYPE *data, void *private_data)
+{
+	src->private_data = private_data;
+	src->data = data;
+}
+
 /*
  * Create a hash table with enough space for `nelements` distinct members.
  * Memory for the hash table is allocated from the passed-in context.  If
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 6f96b31..44abc4f 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -22,6 +22,7 @@
 #include "nodes/plannodes.h"
 #include "nodes/tidbitmap.h"
 #include "partitioning/partdefs.h"
+#include "storage/barrier.h"
 #include "storage/condition_variable.h"
 #include "utils/hsearch.h"
 #include "utils/queryenvironment.h"
@@ -1509,6 +1510,7 @@ typedef struct BitmapIndexScanState
 	ExprContext *biss_RuntimeContext;
 	Relation	biss_RelationDesc;
 	struct IndexScanDescData *biss_ScanDesc;
+	Size		biss_PscanLen;
 } BitmapIndexScanState;
 
 /* ----------------
@@ -1535,24 +1537,30 @@ typedef enum
  *	 ParallelBitmapHeapState information
  *		tbmiterator				iterator for scanning current pages
  *		prefetch_iterator		iterator for prefetching ahead of current page
+ *		tbm_shared				shared copy of tidbitmap
+ *		pt_shared				shared copy of pagetable hash
  *		mutex					mutual exclusion for the prefetching variable
  *								and state
  *		prefetch_pages			# pages prefetch iterator is ahead of current
  *		prefetch_target			current target prefetch distance
  *		state					current state of the TIDBitmap
  *		cv						conditional wait variable
- *		phs_snapshot_data		snapshot data shared to workers
+ *		barrier					barrier to wait for workers to create bitmap
+ *		phs_snapshot_data		snapshot data shared to workers
  * ----------------
  */
 typedef struct ParallelBitmapHeapState
 {
 	dsa_pointer tbmiterator;
 	dsa_pointer prefetch_iterator;
+	dsa_pointer	tbm_shared;
+	dsa_pointer	pt_shared;
 	slock_t		mutex;
 	int			prefetch_pages;
 	int			prefetch_target;
 	SharedBitmapState state;
 	ConditionVariable cv;
+	Barrier		barrier;
 	char		phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER];
 } ParallelBitmapHeapState;
 
diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h
index d562fca..78920b6 100644
--- a/src/include/nodes/tidbitmap.h
+++ b/src/include/nodes/tidbitmap.h
@@ -35,6 +35,7 @@ typedef struct TIDBitmap TIDBitmap;
 /* Likewise, TBMIterator is private */
 typedef struct TBMIterator TBMIterator;
 typedef struct TBMSharedIterator TBMSharedIterator;
+typedef struct TBMSharedData TBMSharedData;
 
 /* Result structure for tbm_iterate */
 typedef struct TBMIterateResult
@@ -63,7 +64,8 @@ extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b);
 extern bool tbm_is_empty(const TIDBitmap *tbm);
 
 extern TBMIterator *tbm_begin_iterate(TIDBitmap *tbm);
-extern dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm);
+extern dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm, dsa_area *dsa,
+											  dsa_pointer dp_pagetable);
 extern TBMIterateResult *tbm_iterate(TBMIterator *iterator);
 extern TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator);
 extern void tbm_end_iterate(TBMIterator *iterator);
@@ -71,5 +73,7 @@ extern void tbm_end_shared_iterate(TBMSharedIterator *iterator);
 extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa,
 													dsa_pointer dp);
 extern long tbm_calculate_entries(double maxbytes);
+extern void tbm_merge(TIDBitmap *tbm, dsa_pointer *dp_tbm,
+							 dsa_pointer *dp_pt);
 
 #endif							/* TIDBITMAP_H */
diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out
index 96dfb7c..c374c58 100644
--- a/src/test/regress/expected/select_parallel.out
+++ b/src/test/regress/expected/select_parallel.out
@@ -512,8 +512,8 @@ END $$;
 set work_mem='64kB';  --set small work mem to force lossy pages
 explain (costs off)
 	select count(*) from tenk1, tenk2 where tenk1.hundred > 1 and tenk2.thousand=0;
-                         QUERY PLAN                         
-------------------------------------------------------------
+                             QUERY PLAN                              
+---------------------------------------------------------------------
  Aggregate
    ->  Nested Loop
          ->  Seq Scan on tenk2
@@ -522,7 +522,7 @@ explain (costs off)
                Workers Planned: 4
                ->  Parallel Bitmap Heap Scan on tenk1
                      Recheck Cond: (hundred > 1)
-                     ->  Bitmap Index Scan on tenk1_hundred
+                     ->  Parallel Bitmap Index Scan on tenk1_hundred
                            Index Cond: (hundred > 1)
 (10 rows)
 
-- 
1.8.3.1

