diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index de0e2ba..6357f29 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -25,6 +25,7 @@
 
 #include "executor/execParallel.h"
 #include "executor/executor.h"
+#include "executor/nodeAppend.h"
 #include "executor/nodeCustom.h"
 #include "executor/nodeForeignscan.h"
 #include "executor/nodeSeqscan.h"
@@ -213,6 +214,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 				ExecForeignScanEstimate((ForeignScanState *) planstate,
 										e->pcxt);
 				break;
+			case T_AppendState:
+				ExecAppendEstimate((AppendState *) planstate,
+										e->pcxt);
+				break;
 			case T_CustomScanState:
 				ExecCustomScanEstimate((CustomScanState *) planstate,
 									   e->pcxt);
@@ -273,6 +278,10 @@ ExecParallelInitializeDSM(PlanState *planstate,
 				ExecForeignScanInitializeDSM((ForeignScanState *) planstate,
 											 d->pcxt);
 				break;
+			case T_AppendState:
+				ExecAppendInitializeDSM((AppendState *) planstate,
+										 d->pcxt);
+				break;
 			case T_CustomScanState:
 				ExecCustomScanInitializeDSM((CustomScanState *) planstate,
 											d->pcxt);
@@ -771,6 +780,9 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc)
 				ExecForeignScanInitializeWorker((ForeignScanState *) planstate,
 												toc);
 				break;
+			case T_AppendState:
+				ExecAppendInitializeWorker((AppendState *) planstate, toc);
+				break;
 			case T_CustomScanState:
 				ExecCustomScanInitializeWorker((CustomScanState *) planstate,
 											   toc);
diff --git a/src/backend/executor/nodeAppend.c b/src/backend/executor/nodeAppend.c
index 6986cae..a5ffb38 100644
--- a/src/backend/executor/nodeAppend.c
+++ b/src/backend/executor/nodeAppend.c
@@ -59,9 +59,56 @@
 
 #include "executor/execdebug.h"
 #include "executor/nodeAppend.h"
+#include "miscadmin.h"
+#include "optimizer/cost.h"
+#include "storage/spin.h"
 
+/*
+ * Shared state for Parallel Append.
+ *
+ * Each backend participating in a Parallel Append has its own
+ * descriptor in backend-private memory, and those objects all contain
+ * a pointer to this structure.
+ */
+typedef struct ParallelAppendInfo
+{
+	/*
+	 * pa_num_workers: number of workers currently executing this subplan. A
+	 * worker that finishes a subplan sets pa_num_workers to -1, so that no
+	 * new worker picks it up. For a non-partial subplan, the worker that
+	 * picks it up immediately sets it to -1, ensuring that no more than one
+	 * worker is ever assigned to it. In general, -1 means workers should
+	 * stop picking this subplan.
+	 */
+	int		pa_num_workers;
+} ParallelAppendInfo;
+
+typedef struct ParallelAppendDescData
+{
+	slock_t		pa_mutex;		/* mutual exclusion to choose next subplan */
+	ParallelAppendInfo pa_info[FLEXIBLE_ARRAY_MEMBER];
+} ParallelAppendDescData;
+
+typedef ParallelAppendDescData *ParallelAppendDesc;
+
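+/*
+ * For illustration (a hypothetical Append with three subplans, of which only
+ * subplan 2 is partial): pa_num_workers starts out as {0, 0, 0}. The first
+ * worker picks subplan 0; being non-partial, it is immediately marked taken,
+ * giving {-1, 0, 0}. The next worker skips it and takes subplan 1, giving
+ * {-1, -1, 0}. Every later worker lands on the partial subplan 2 and
+ * increments its count: {-1, -1, 1}, {-1, -1, 2}, and so on. Once a worker
+ * exhausts subplan 2 and calls set_finished(), the state becomes
+ * {-1, -1, -1} and parallel_append_next() returns false for everyone.
+ */
+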
+/*
+ * Special value of AppendState->as_whichplan for Parallel Append, which
+ * indicates there are no plans left to be executed.
+ */
+#define PA_INVALID_PLAN -1
+
+
+static void exec_append_scan_first(AppendState *appendstate);
 static bool exec_append_initialize_next(AppendState *appendstate);
+static void set_finished(ParallelAppendDesc padesc, int whichplan);
+static bool parallel_append_next(AppendState *state);
 
+static inline void
+exec_append_scan_first(AppendState *appendstate)
+{
+	appendstate->as_whichplan = 0;
+}
 
 /* ----------------------------------------------------------------
  *		exec_append_initialize_next
@@ -77,6 +124,27 @@ exec_append_initialize_next(AppendState *appendstate)
 	int			whichplan;
 
 	/*
+	 * If the node is parallel-aware, follow its own logic for choosing the
+	 * next subplan.
+	 */
+	if (appendstate->as_padesc)
+	{
+		/* Backward scan is not supported by parallel-aware plans */
+		Assert(!ScanDirectionIsBackward(appendstate->ps.state->es_direction));
+
+		return parallel_append_next(appendstate);
+	}
+
+	/*
+	 * Not parallel-aware: just advance to the next subplan in the
+	 * appropriate scan direction.
+	 */
+	if (ScanDirectionIsForward(appendstate->ps.state->es_direction))
+		appendstate->as_whichplan++;
+	else
+		appendstate->as_whichplan--;
+
+	/*
 	 * get information from the append node
 	 */
 	whichplan = appendstate->as_whichplan;
@@ -178,8 +246,7 @@ ExecInitAppend(Append *node, EState *estate, int eflags)
 	/*
 	 * initialize to scan first subplan
 	 */
-	appendstate->as_whichplan = 0;
-	exec_append_initialize_next(appendstate);
+	exec_append_scan_first(appendstate);
 
 	return appendstate;
 }
@@ -198,6 +265,14 @@ ExecAppend(AppendState *node)
 		PlanState  *subnode;
 		TupleTableSlot *result;
 
+		/* Check whether parallel append has already finished all plans */
+		if (node->as_padesc && node->as_whichplan == PA_INVALID_PLAN)
+		{
+			elog(DEBUG2, "ParallelAppend : pid %d : all plans already finished",
+						 MyProcPid);
+			return ExecClearTuple(node->ps.ps_ResultTupleSlot);
+		}
+
 		/*
 		 * figure out which subplan we are currently processing
 		 */
@@ -219,14 +294,18 @@ ExecAppend(AppendState *node)
 		}
 
 		/*
-		 * Go on to the "next" subplan in the appropriate direction. If no
-		 * more subplans, return the empty slot set up for us by
-		 * ExecInitAppend.
+		 * We are done with this subplan. Other workers might still be
+		 * processing the last chunk of its rows, but there's no point in new
+		 * workers picking it up, so mark this subplan as finished.
+		 */
+		if (node->as_padesc)
+			set_finished(node->as_padesc, node->as_whichplan);
+
+		/*
+		 * Go on to the "next" subplan. If no more subplans, return the empty
+		 * slot set up for us by ExecInitAppend.
 		 */
-		if (ScanDirectionIsForward(node->ps.state->es_direction))
-			node->as_whichplan++;
-		else
-			node->as_whichplan--;
 		if (!exec_append_initialize_next(node))
 			return ExecClearTuple(node->ps.ps_ResultTupleSlot);
 
@@ -270,6 +349,7 @@ ExecReScanAppend(AppendState *node)
 	for (i = 0; i < node->as_nplans; i++)
 	{
 		PlanState  *subnode = node->appendplans[i];
+		ParallelAppendDesc padesc = node->as_padesc;
 
 		/*
 		 * ExecReScan doesn't know about my subplans, so I have to do
@@ -284,7 +364,204 @@ ExecReScanAppend(AppendState *node)
 		 */
 		if (subnode->chgParam == NULL)
 			ExecReScan(subnode);
+
+		if (padesc)
+		{
+			/*
+			 * Resetting the worker count to 0 is enough; the logic for
+			 * choosing the next plan takes care of everything else.
+			 */
+			padesc->pa_info[i].pa_num_workers = 0;
+		}
+	}
+
+	exec_append_scan_first(node);
+}
+
+/* ----------------------------------------------------------------
+ *						Parallel Append Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ *		ExecAppendEstimate
+ *
+ *		estimates the space required for the shared Parallel Append state.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAppendEstimate(AppendState *node,
+					ParallelContext *pcxt)
+{
+	node->pappend_len =
+		add_size(offsetof(struct ParallelAppendDescData, pa_info),
+				 mul_size(sizeof(ParallelAppendInfo), node->as_nplans));
+
+	shm_toc_estimate_chunk(&pcxt->estimator, node->pappend_len);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+
+/* ----------------------------------------------------------------
+ *		ExecAppendInitializeDSM
+ *
+ *		Set up a Parallel Append descriptor.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAppendInitializeDSM(AppendState *node,
+						 ParallelContext *pcxt)
+{
+	ParallelAppendDesc padesc;
+	int			i;
+
+	padesc = shm_toc_allocate(pcxt->toc, node->pappend_len);
+	SpinLockInit(&padesc->pa_mutex);
+
+	for (i = 0; i < node->as_nplans; i++)
+	{
+		/*
+		 * Initializing the worker count to 0 is enough; the logic for
+		 * choosing the next plan in the workers takes care of everything
+		 * else.
+		 */
+		padesc->pa_info[i].pa_num_workers = 0;
+	}
+
+	shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id, padesc);
+	node->as_padesc = padesc;
+
+	/* Choose the optimal subplan to be executed. */
+	(void) parallel_append_next(node);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecAppendInitializeWorker
+ *
+ *		Copy relevant information from TOC into planstate, and initialize
+ *		whatever is required to choose and execute the optimal subplan.
+ * ----------------------------------------------------------------
+ */
+void
+ExecAppendInitializeWorker(AppendState *node, shm_toc *toc)
+{
+	node->as_padesc = shm_toc_lookup(toc, node->ps.plan->plan_node_id);
+
+	/* Choose the optimal subplan to be executed. */
+	(void) parallel_append_next(node);
+}
+
+/* ----------------------------------------------------------------
+ *		set_finished
+ *
+ *		Indicate that this child plan node is about to finish, so that no
+ *		other workers take it up. Workers already executing this node will
+ *		continue to do so, but workers looking for their next node will skip
+ *		it once this function has been called. Multiple workers may call this
+ *		function for the same node at the same time, if they were executing
+ *		that node and finished with it simultaneously; the spinlock is not
+ *		there to prevent that. Rather, it ensures that the pa_num_workers
+ *		field does not change while other workers are choosing their next
+ *		node.
+ * ----------------------------------------------------------------
+ */
+static void
+set_finished(ParallelAppendDesc padesc, int whichplan)
+{
+	elog(DEBUG2, "Parallelappend : pid %d : finishing plan %d",
+				 MyProcPid, whichplan);
+
+	SpinLockAcquire(&padesc->pa_mutex);
+	padesc->pa_info[whichplan].pa_num_workers = -1;
+	SpinLockRelease(&padesc->pa_mutex);
+}
+
+/* ----------------------------------------------------------------
+ *		parallel_append_next
+ *
+ *		Determine the optimal subplan that should be executed. The logic is to
+ *		choose the subplan that is being executed by the least number of
+ *		workers.
+ *
+ *		Returns false if and only if all subplans are already finished
+ *		processing.
+ * ----------------------------------------------------------------
+ */
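+/*
+ * For example, with hypothetical counts pa_num_workers = {-1, 1, 0}, the
+ * loop below skips the finished plan 0 and chooses plan 2, which has fewer
+ * workers than plan 1. If plan 2 is partial its count becomes 1; otherwise
+ * it is set to -1 so that no one else picks it.
+ */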
+static bool
+parallel_append_next(AppendState *state)
+{
+	ParallelAppendDesc padesc = state->as_padesc;
+	int		whichplan;
+	int		min_whichplan = PA_INVALID_PLAN;
+	int		min_workers = -1; /* Keep compiler quiet */
+
+	Assert(padesc != NULL);
+
+	SpinLockAcquire(&padesc->pa_mutex);
+
+	/* Choose the plan with the least number of workers */
+	for (whichplan = 0; whichplan < state->as_nplans; whichplan++)
+	{
+		ParallelAppendInfo *painfo = &padesc->pa_info[whichplan];
+
+		/*
+		 * Ignore plans that are already done. This includes non-partial
+		 * subplans that have already been taken by a worker.
+		 */
+		if (painfo->pa_num_workers == -1)
+			continue;
+
+		/*
+		 * Keep track of the node with the fewest workers so far; the first
+		 * eligible plan starts out as the least-workers node.
+		 */
+		if (min_whichplan == PA_INVALID_PLAN ||
+			painfo->pa_num_workers < min_workers)
+		{
+			min_whichplan = whichplan;
+			min_workers = painfo->pa_num_workers;
+		}
 	}
-	node->as_whichplan = 0;
-	exec_append_initialize_next(node);
+
+	/*
+	 * Increment the worker count for the chosen node, if we found one at all.
+	 * For a non-partial plan, set it to -1 instead, so that no other worker
+	 * runs it.
+	 */
+	if (min_whichplan != PA_INVALID_PLAN)
+	{
+		if (bms_is_member(min_whichplan,
+						  ((Append*)state->ps.plan)->partial_subplans_set))
+			padesc->pa_info[min_whichplan].pa_num_workers++;
+		else
+			padesc->pa_info[min_whichplan].pa_num_workers = -1;
+	}
+
+	/*
+	 * Save the chosen plan index. It can be PA_INVALID_PLAN, which means we
+	 * are done with all nodes (note that this meaning applies only to
+	 * *parallel* append).
+	 */
+	state->as_whichplan = min_whichplan;
+
+	/*
+	 * Note: there is a chance that, just after the child plan node is chosen
+	 * here and the spinlock released, some other worker finishes this node
+	 * and calls set_finished(). In that case, this worker will go ahead and
+	 * call ExecProcNode(child_node), which will return a NULL tuple since the
+	 * node is already finished, and then this worker will once again try to
+	 * choose the next subplan; but that is OK: it just costs one extra
+	 * "choose next subplan" operation.
+	 */
+	SpinLockRelease(&padesc->pa_mutex);
+	elog(DEBUG2, "ParallelAppend : pid %d : Chosen plan : %d",
+				 MyProcPid, min_whichplan);
+
+	/*
+	 * Return false if we didn't find any node to work on, so that the caller
+	 * stops executing.
+	 */
+	return (min_whichplan != PA_INVALID_PLAN);
 }
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index bb2a8a3..67f722a 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -236,6 +236,7 @@ _copyAppend(const Append *from)
 	 * copy remainder of node
 	 */
 	COPY_NODE_FIELD(appendplans);
+	COPY_BITMAPSET_FIELD(partial_subplans_set);
 
 	return newnode;
 }
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index b3802b4..69f1139 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -369,6 +369,7 @@ _outAppend(StringInfo str, const Append *node)
 	_outPlanInfo(str, (const Plan *) node);
 
 	WRITE_NODE_FIELD(appendplans);
+	WRITE_BITMAPSET_FIELD(partial_subplans_set);
 }
 
 static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 05bf2e9..6d3ca5d 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1537,6 +1537,7 @@ _readAppend(void)
 	ReadCommonPlan(&local_node->plan);
 
 	READ_NODE_FIELD(appendplans);
+	READ_BITMAPSET_FIELD(partial_subplans_set);
 
 	READ_DONE();
 }
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 87a3faf..7a59c8e 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -1232,14 +1232,50 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
 		 */
 		if (childrel->cheapest_total_path->param_info == NULL)
 			subpaths = accumulate_append_subpath(subpaths,
-											  childrel->cheapest_total_path);
+												 childrel->cheapest_total_path);
 		else
 			subpaths_valid = false;
 
 		/* Same idea, but for a partial plan. */
 		if (childrel->partial_pathlist != NIL)
+		{
 			partial_subpaths = accumulate_append_subpath(partial_subpaths,
 									   linitial(childrel->partial_pathlist));
+		}
+		else if (enable_parallelappend)
+		{
+			/*
+			 * Extract the first unparameterized, parallel-safe path among
+			 * the child's paths.
+			 */
+			Path	   *parallel_safe_path = NULL;
+			ListCell   *lcp;
+
+			foreach(lcp, childrel->pathlist)
+			{
+				Path	   *child_path = (Path *) lfirst(lcp);
+
+				if (child_path->parallel_safe &&
+					child_path->param_info == NULL)
+				{
+					parallel_safe_path = child_path;
+					break;
+				}
+			}
+
+			/* If we got one parallel-safe path, add it */
+			if (parallel_safe_path)
+			{
+				partial_subpaths =
+					accumulate_append_subpath(partial_subpaths,
+											  parallel_safe_path);
+			}
+			else
+			{
+				/*
+				 * This child rel has neither a partial path nor an
+				 * unparameterized parallel-safe path, so a partial Append
+				 * path is not possible.
+				 */
+				partial_subpaths_valid = false;
+			}
+		}
 		else
 			partial_subpaths_valid = false;
 
@@ -1322,24 +1358,10 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
 	if (partial_subpaths_valid)
 	{
 		AppendPath *appendpath;
-		ListCell   *lc;
-		int			parallel_workers = 0;
-
-		/*
-		 * Decide on the number of workers to request for this append path.
-		 * For now, we just use the maximum value from among the members.  It
-		 * might be useful to use a higher number if the Append node were
-		 * smart enough to spread out the workers, but it currently isn't.
-		 */
-		foreach(lc, partial_subpaths)
-		{
-			Path	   *path = lfirst(lc);
+		int			parallel_workers;
 
-			parallel_workers = Max(parallel_workers, path->parallel_workers);
-		}
-		Assert(parallel_workers > 0);
+		parallel_workers = get_append_num_workers(partial_subpaths);
 
-		/* Generate a partial append path. */
 		appendpath = create_append_path(rel, partial_subpaths, NULL,
 										parallel_workers);
 		add_partial_path(rel, (Path *) appendpath);
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index c138f57..ccd6733 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -126,6 +126,7 @@ bool		enable_nestloop = true;
 bool		enable_material = true;
 bool		enable_mergejoin = true;
 bool		enable_hashjoin = true;
+bool		enable_parallelappend = true;
 
 typedef struct
 {
@@ -1559,6 +1560,70 @@ cost_sort(Path *path, PlannerInfo *root,
 }
 
 /*
+ * cost_append
+ *	  Determines and returns the cost of an Append node.
+ *
+ * We charge nothing extra for the Append itself, which perhaps is too
+ * optimistic, but since it doesn't do any selection or projection, it is a
+ * pretty cheap node.
+ */
+void
+cost_append(Path *path, List *subpaths)
+{
+	ListCell *l;
+
+	path->rows = 0;
+	path->startup_cost = 0;
+	path->total_cost = 0;
+
+	if (path->parallel_aware)
+	{
+		double		parallel_divisor;
+
+		foreach(l, subpaths)
+		{
+			Path	   *subpath = (Path *) lfirst(l);
+
+			/*
+			 * The subpath rows and cost are per-worker figures. We need the
+			 * totals across all workers for each subpath, so that we can
+			 * determine the total cost of the Append.
+			 */
+			parallel_divisor = get_parallel_divisor(subpath);
+			path->rows += (subpath->rows * parallel_divisor);
+			path->total_cost += (subpath->total_cost * parallel_divisor);
+
+			/*
+			 * Append starts returning tuples when the child node with the
+			 * lowest startup cost is done setting up.
+			 */
+			if (l == list_head(subpaths))	/* first node? */
+				path->startup_cost = subpath->startup_cost;
+			else
+				path->startup_cost = Min(path->startup_cost,
+										 subpath->startup_cost);
+		}
+
+		/* The row count and cost should represent per-worker figures. */
+		parallel_divisor = get_parallel_divisor(path);
+		path->rows = clamp_row_est(path->rows / parallel_divisor);
+		path->total_cost /= parallel_divisor;
+	}
+	else
+	{
+		/* Compute rows and costs as sums of subplan rows and costs. */
+		foreach(l, subpaths)
+		{
+			Path	   *subpath = (Path *) lfirst(l);
+
+			path->rows += subpath->rows;
+
+			path->total_cost += subpath->total_cost;
+			if (l == list_head(subpaths))	/* first node? */
+				path->startup_cost = subpath->startup_cost;
+		}
+	}
+}
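+
+/*
+ * A worked example of the parallel-aware branch, with made-up numbers:
+ * suppose the Append has two partial subpaths, each with per-worker
+ * rows = 100 and total_cost = 100, and suppose get_parallel_divisor()
+ * returns 2 for each subpath and for the Append itself. The loop
+ * accumulates cross-worker totals of 400 rows and cost 400; dividing by
+ * the Append's own divisor converts these back to per-worker figures of
+ * 200 rows and cost 200, i.e. each worker handles half of the combined
+ * input.
+ */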
+
+/*
  * cost_merge_append
  *	  Determines and returns the cost of a MergeAppend node.
  *
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 1e953b4..04b0414 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -29,6 +29,7 @@
 #include "nodes/nodeFuncs.h"
 #include "optimizer/clauses.h"
 #include "optimizer/cost.h"
+#include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "optimizer/placeholder.h"
 #include "optimizer/plancat.h"
@@ -194,7 +195,8 @@ static CteScan *make_ctescan(List *qptlist, List *qpqual,
 			 Index scanrelid, int ctePlanId, int cteParam);
 static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual,
 				   Index scanrelid, int wtParam);
-static Append *make_append(List *appendplans, List *tlist);
+static Append *make_append(List *appendplans, Bitmapset *partial_plans_set,
+						   List *tlist);
 static RecursiveUnion *make_recursive_union(List *tlist,
 					 Plan *lefttree,
 					 Plan *righttree,
@@ -962,6 +964,8 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path)
 	List	   *tlist = build_path_tlist(root, &best_path->path);
 	List	   *subplans = NIL;
 	ListCell   *subpaths;
+	Bitmapset  *partial_subplans_set;
+	int			i;
 
 	/*
 	 * The subpaths list could be empty, if every child was proven empty by
@@ -987,12 +991,25 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path)
 		return plan;
 	}
 
-	/* Build the plan for each child */
+	/* Build the plan for each child, and a bitmapset of partial subpaths */
+	partial_subplans_set = NULL;
+	i = 0;
 	foreach(subpaths, best_path->subpaths)
 	{
 		Path	   *subpath = (Path *) lfirst(subpaths);
+		RelOptInfo *rel = subpath->parent;
 		Plan	   *subplan;
 
+		/*
+		 * If this subpath is actually the cheapest partial path, add it to
+		 * the partial subplans set.
+		 */
+		if (rel->partial_pathlist != NIL &&
+			(Path *) linitial(rel->partial_pathlist) == subpath)
+			partial_subplans_set = bms_add_member(partial_subplans_set, i);
+
+		i++;
+
 		/* Must insist that all children return the same tlist */
 		subplan = create_plan_recurse(root, subpath, CP_EXACT_TLIST);
 
@@ -1006,7 +1023,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path)
 	 * parent-rel Vars it'll be asked to emit.
 	 */
 
-	plan = make_append(subplans, tlist);
+	plan = make_append(subplans, partial_subplans_set, tlist);
 
 	copy_generic_path_info(&plan->plan, (Path *) best_path);
 
@@ -5003,7 +5020,7 @@ make_foreignscan(List *qptlist,
 }
 
 static Append *
-make_append(List *appendplans, List *tlist)
+make_append(List *appendplans, Bitmapset *partial_plans_set, List *tlist)
 {
 	Append	   *node = makeNode(Append);
 	Plan	   *plan = &node->plan;
@@ -5013,6 +5030,7 @@ make_append(List *appendplans, List *tlist)
 	plan->lefttree = NULL;
 	plan->righttree = NULL;
 	node->appendplans = appendplans;
+	node->partial_subplans_set = partial_plans_set;
 
 	return node;
 }
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index ca0ae78..fb91264 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -3334,10 +3334,7 @@ create_grouping_paths(PlannerInfo *root,
 				paths = lappend(paths, path);
 			}
 			path = (Path *)
-				create_append_path(grouped_rel,
-								   paths,
-								   NULL,
-								   0);
+				create_append_path(grouped_rel, paths, NULL, 0);
 			path->pathtarget = target;
 		}
 		else
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 3248296..1b8e362 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1192,6 +1192,67 @@ create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals,
 }
 
 /*
+ * get_append_num_workers
+ *    Return the number of workers to request for a partial Append path.
+ */
+int
+get_append_num_workers(List *subpaths)
+{
+	ListCell   *lc;
+	double		log2w;
+	int			num_workers;
+	int			max_per_plan_workers;
+
+	/*
+	 * The log2(number_of_subpaths) + 1 formula gives an appropriate number
+	 * of workers for an Append path that either has a high number of
+	 * children (> 100), or has only non-partial subpaths, or has subpaths
+	 * with just 1-2 parallel_workers each. But if a subpath's
+	 * parallel_workers is high, this formula is not suitable, because it
+	 * does not take per-subpath workers into account. For example, with
+	 * per-subpath workers (2, 8, 8), the Append should get at least 8
+	 * workers, whereas the formula gives 2. In that case, it seems better
+	 * to follow the method used for calculating the parallel_workers of an
+	 * unpartitioned table, namely log3(table_size): treat the UNION query
+	 * as if its data belonged to a single unpartitioned table, and derive
+	 * the workers from that. That works out to logb(b^w1 + b^w2 + b^w3),
+	 * where w1, w2, ... are the per-subplan workers and b is some
+	 * logarithmic base such as 2 or 3; it evaluates to a value just a bit
+	 * greater than max(w1, w2, w3), so we simply use the maximum of the
+	 * per-subpath workers. But that in turn gives too few workers when all
+	 * paths have a single worker (i.e., they are non-partial): with workers
+	 * (1, 1, 1, 1, 1, 1) it allocates only 1 worker, whereas 3 would be
+	 * better. So we use whichever of the two methods gives the higher
+	 * number of workers.
+	 */
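+
+	/*
+	 * Two illustrative cases: with 6 non-partial subpaths,
+	 * max_per_plan_workers stays at its floor of 1, while
+	 * log2(6) + 1 ~ 3.6, so rint() requests 4 workers; with per-subpath
+	 * workers (2, 8, 8), the maximum of 8 beats log2(3) + 1 ~ 2.6, so 9
+	 * workers are requested. Either result is then capped at
+	 * max_parallel_workers_per_gather.
+	 */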
+
+	/* Get log2(num_subpaths), i.e. ln(num_subpaths) / ln(2) */
+	log2w = log(list_length(subpaths)) / 0.693;
+
+	/* Avoid further calculations if we already crossed max workers limit */
+	if (max_parallel_workers_per_gather <= log2w + 1)
+		return max_parallel_workers_per_gather;
+
+	/*
+	 * Get the parallel_workers value of the subpath having the highest
+	 * parallel_workers.
+	 */
+	max_per_plan_workers = 1;
+	foreach(lc, subpaths)
+	{
+		Path	   *subpath = (Path *) lfirst(lc);
+
+		max_per_plan_workers = Max(max_per_plan_workers,
+								   subpath->parallel_workers);
+	}
+
+	num_workers = rint(Max(log2w, max_per_plan_workers) + 1);
+
+	/* In no case use more than max_parallel_workers_per_gather workers. */
+	num_workers = Min(num_workers, max_parallel_workers_per_gather);
+
+	return num_workers;
+}
+
+/*
  * create_append_path
  *	  Creates a path corresponding to an Append plan, returning the
  *	  pathnode.
@@ -1210,40 +1271,27 @@ create_append_path(RelOptInfo *rel, List *subpaths, Relids required_outer,
 	pathnode->path.pathtarget = rel->reltarget;
 	pathnode->path.param_info = get_appendrel_parampathinfo(rel,
 															required_outer);
-	pathnode->path.parallel_aware = false;
+	pathnode->path.parallel_aware =
+		(enable_parallelappend && parallel_workers > 0);
 	pathnode->path.parallel_safe = rel->consider_parallel;
 	pathnode->path.parallel_workers = parallel_workers;
 	pathnode->path.pathkeys = NIL;		/* result is always considered
 										 * unsorted */
 	pathnode->subpaths = subpaths;
 
-	/*
-	 * We don't bother with inventing a cost_append(), but just do it here.
-	 *
-	 * Compute rows and costs as sums of subplan rows and costs.  We charge
-	 * nothing extra for the Append itself, which perhaps is too optimistic,
-	 * but since it doesn't do any selection or projection, it is a pretty
-	 * cheap node.
-	 */
-	pathnode->path.rows = 0;
-	pathnode->path.startup_cost = 0;
-	pathnode->path.total_cost = 0;
 	foreach(l, subpaths)
 	{
 		Path	   *subpath = (Path *) lfirst(l);
 
-		pathnode->path.rows += subpath->rows;
-
-		if (l == list_head(subpaths))	/* first node? */
-			pathnode->path.startup_cost = subpath->startup_cost;
-		pathnode->path.total_cost += subpath->total_cost;
 		pathnode->path.parallel_safe = pathnode->path.parallel_safe &&
-			subpath->parallel_safe;
+									   subpath->parallel_safe;
 
 		/* All child paths must have same parameterization */
 		Assert(bms_equal(PATH_REQ_OUTER(subpath), required_outer));
 	}
 
+	cost_append(&pathnode->path, subpaths);
+
 	return pathnode;
 }
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f8b073d..2994413 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -902,6 +902,16 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_parallelappend", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's use of parallel append plans."),
+			NULL
+		},
+		&enable_parallelappend,
+		true,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
diff --git a/src/include/executor/nodeAppend.h b/src/include/executor/nodeAppend.h
index 6fb4662..e76027f 100644
--- a/src/include/executor/nodeAppend.h
+++ b/src/include/executor/nodeAppend.h
@@ -14,11 +14,15 @@
 #ifndef NODEAPPEND_H
 #define NODEAPPEND_H
 
+#include "access/parallel.h"
 #include "nodes/execnodes.h"
 
 extern AppendState *ExecInitAppend(Append *node, EState *estate, int eflags);
 extern TupleTableSlot *ExecAppend(AppendState *node);
 extern void ExecEndAppend(AppendState *node);
 extern void ExecReScanAppend(AppendState *node);
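+
+/*
+ * Parallel-setup entry points, invoked from execParallel.c: the leader
+ * calls ExecAppendEstimate and then ExecAppendInitializeDSM while setting
+ * up the parallel context, and each worker calls ExecAppendInitializeWorker
+ * to attach to the shared Parallel Append state.
+ */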
+extern void ExecAppendEstimate(AppendState *node, ParallelContext *pcxt);
+extern void ExecAppendInitializeDSM(AppendState *node, ParallelContext *pcxt);
+extern void ExecAppendInitializeWorker(AppendState *node, shm_toc *toc);
 
 #endif   /* NODEAPPEND_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 6332ea0..c887be6 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -21,6 +21,7 @@
 #include "lib/pairingheap.h"
 #include "nodes/params.h"
 #include "nodes/plannodes.h"
+#include "storage/spin.h"
 #include "utils/hsearch.h"
 #include "utils/reltrigger.h"
 #include "utils/sortsupport.h"
@@ -1185,12 +1186,15 @@ typedef struct ModifyTableState
  *		whichplan		which plan is being executed (0 .. n-1)
  * ----------------
  */
+struct ParallelAppendDescData;
 typedef struct AppendState
 {
 	PlanState	ps;				/* its first field is NodeTag */
 	PlanState **appendplans;	/* array of PlanStates for my inputs */
 	int			as_nplans;
 	int			as_whichplan;
+	struct ParallelAppendDescData *as_padesc; /* parallel coordination info */
+	Size		pappend_len;	/* size of parallel coordination info */
 } AppendState;
 
 /* ----------------
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index f72f7a8..6d772ca 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -228,6 +228,7 @@ typedef struct Append
 {
 	Plan		plan;
 	List	   *appendplans;
+	Bitmapset  *partial_subplans_set;
 } Append;
 
 /* ----------------
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 72200fa..484e179 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -66,6 +66,7 @@ extern bool enable_nestloop;
 extern bool enable_material;
 extern bool enable_mergejoin;
 extern bool enable_hashjoin;
+extern bool enable_parallelappend;
 extern int	constraint_exclusion;
 
 extern double clamp_row_est(double nrows);
@@ -98,6 +99,7 @@ extern void cost_sort(Path *path, PlannerInfo *root,
 		  List *pathkeys, Cost input_cost, double tuples, int width,
 		  Cost comparison_cost, int sort_mem,
 		  double limit_tuples);
+extern void cost_append(Path *path, List *subpaths);
 extern void cost_merge_append(Path *path, PlannerInfo *root,
 				  List *pathkeys, int n_streams,
 				  Cost input_startup_cost, Cost input_total_cost,
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 53cad24..dbec534 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -14,6 +14,7 @@
 #ifndef PATHNODE_H
 #define PATHNODE_H
 
+#include "nodes/bitmapset.h"
 #include "nodes/relation.h"
 
 
@@ -62,8 +63,10 @@ extern BitmapOrPath *create_bitmap_or_path(PlannerInfo *root,
 					  List *bitmapquals);
 extern TidPath *create_tidscan_path(PlannerInfo *root, RelOptInfo *rel,
 					List *tidquals, Relids required_outer);
-extern AppendPath *create_append_path(RelOptInfo *rel, List *subpaths,
-				   Relids required_outer, int parallel_workers);
+extern int get_append_num_workers(List *subpaths);
+extern AppendPath *create_append_path(RelOptInfo *rel,
+					   List *subpaths, Relids required_outer,
+					   int parallel_workers);
 extern MergeAppendPath *create_merge_append_path(PlannerInfo *root,
 						 RelOptInfo *rel,
 						 List *subpaths,
diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out
index 795d9f5..367d23f 100644
--- a/src/test/regress/expected/inherit.out
+++ b/src/test/regress/expected/inherit.out
@@ -1337,6 +1337,7 @@ select min(1-id) from matest0;
 
 reset enable_indexscan;
 set enable_seqscan = off;  -- plan with fewest seqscans should be merge
+set enable_parallelappend = off; -- Don't let parallel-append interfere
 explain (verbose, costs off) select * from matest0 order by 1-id;
                             QUERY PLAN                            
 ------------------------------------------------------------------
@@ -1403,6 +1404,7 @@ select min(1-id) from matest0;
 (1 row)
 
 reset enable_seqscan;
+reset enable_parallelappend;
 drop table matest0 cascade;
 NOTICE:  drop cascades to 3 other objects
 DETAIL:  drop cascades to table matest1
diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out
index 75558d0..3071bae 100644
--- a/src/test/regress/expected/select_parallel.out
+++ b/src/test/regress/expected/select_parallel.out
@@ -17,9 +17,9 @@ explain (costs off)
 -----------------------------------------------------
  Finalize Aggregate
    ->  Gather
-         Workers Planned: 1
+         Workers Planned: 4
          ->  Partial Aggregate
-               ->  Append
+               ->  Parallel Append
                      ->  Parallel Seq Scan on a_star
                      ->  Parallel Seq Scan on b_star
                      ->  Parallel Seq Scan on c_star
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index d48abd7..7a303fa 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -70,20 +70,21 @@ select count(*) >= 0 as ok from pg_prepared_xacts;
 -- This is to record the prevailing planner enable_foo settings during
 -- a regression test run.
 select name, setting from pg_settings where name like 'enable%';
-         name         | setting 
-----------------------+---------
- enable_bitmapscan    | on
- enable_hashagg       | on
- enable_hashjoin      | on
- enable_indexonlyscan | on
- enable_indexscan     | on
- enable_material      | on
- enable_mergejoin     | on
- enable_nestloop      | on
- enable_seqscan       | on
- enable_sort          | on
- enable_tidscan       | on
-(11 rows)
+         name          | setting 
+-----------------------+---------
+ enable_bitmapscan     | on
+ enable_hashagg        | on
+ enable_hashjoin       | on
+ enable_indexonlyscan  | on
+ enable_indexscan      | on
+ enable_material       | on
+ enable_mergejoin      | on
+ enable_nestloop       | on
+ enable_parallelappend | on
+ enable_seqscan        | on
+ enable_sort           | on
+ enable_tidscan        | on
+(12 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql
index 836ec22..0636f08 100644
--- a/src/test/regress/sql/inherit.sql
+++ b/src/test/regress/sql/inherit.sql
@@ -462,11 +462,13 @@ select min(1-id) from matest0;
 reset enable_indexscan;
 
 set enable_seqscan = off;  -- plan with fewest seqscans should be merge
+set enable_parallelappend = off; -- Don't let parallel-append interfere
 explain (verbose, costs off) select * from matest0 order by 1-id;
 select * from matest0 order by 1-id;
 explain (verbose, costs off) select min(1-id) from matest0;
 select min(1-id) from matest0;
 reset enable_seqscan;
+reset enable_parallelappend;
 
 drop table matest0 cascade;
 
