> > > On Sat, Jun 15, 2019 at 10:02 AM Tom Lane <t...@sss.pgh.pa.us> wrote:
> > >
> > > Another reason for having the planner do this is that presumably, in
> > > an AM that's excited about this, the set of fetched columns should
> > > play into the cost estimates for the scan.  I've not been paying
> > > enough attention to the tableam work to know if we've got hooks for
> > > the AM to affect scan costing ... but if we don't, that seems like
> > > a hole that needs plugged.
> >
> > AM callback relation_estimate_size exists currently which planner
> leverages.
> > Via this callback it fetches tuples, pages, etc.. So, our thought is to
> extend
> > this API if possible to pass down needed column and help perform better
> costing
> > for the query. Though we think if wish to leverage this function, need
> to know
> > list of columns before planning hence might need to use query tree.
>
> I believe it would be beneficial to add this potential API extension patch
> into
> the thread (as an example of an interface defining how scanCols could be
> used)
> and review them together.
>
> Thanks for your suggestion, we paste one potential API extension change
below for zedstore to use scanCols.

The change contains 3 patches to clarify our idea.
0001-ANALYZE.patch is a generic patch for ANALYZE API extension, we develop
it to make the
analysis of zedstore tables more accurate. It is more flexible now, eg,
TableAm can provide
logical block number as random sample seed; TableAm can only analyze
specified columns; TableAm
can provide extra info besides the data tuple.

0002-Planner.patch is the real patch to show how we use rte->scanCols for a
cost estimate, the main idea
is adding a new metric 'stadiskfrac' to catalog pg_statistic, 'stadiskfrac'
is the physical size ratio of a column,
it is calculated when ANALYZE is performed, 0001-ANALYZE.patch can help to
provide extra disk size info.
So when set_plain_rel_size() is called by the planner, it uses
rte->scanCols and 'stadiskfrac' to adjust the
rel->pages, please see set_plain_rel_page_estimates().

0003-ZedStore.patch is an example of how zedstore uses the extended ANALYZE
API. I paste it here anyway, in case someone
is interested in it.

Thanks,
Pengzhou
From 77d257ab002a5e1b6a2f65e359cbfd7978e3cff5 Mon Sep 17 00:00:00 2001
From: Pengzhou Tang <ptang@pivotal.io>
Date: Wed, 20 Nov 2019 06:42:37 -0500
Subject: [PATCH 1/3] ANALYZE tableam API change

Extended three ANALYZE-related tableam APIs so AMs can take more control
of the ANALYZE progress:
- scan_analyze_beginscan() : so AMs can have a more flexible sampling strategy
- scan_analyze_sample_tuple() : so ANALYZE can get extra info as needed
- scan_analyze_endscan() :

Also use struct AnalyzeSampleContext for more convenience; with it,
tableam analyze routines can provide extra info besides the real data,
for example: physical size or compression ratio.
---
 contrib/file_fdw/file_fdw.c              |  35 +++---
 contrib/postgres_fdw/postgres_fdw.c      |  56 +++++----
 src/backend/access/heap/heapam_handler.c | 109 ++++++++++++++--
 src/backend/access/table/tableam.c       | 209 +++++++++++++++++++++++++++++++
 src/backend/commands/analyze.c           | 181 ++++++++------------------
 src/include/access/tableam.h             | 138 +++++++++++++++++---
 src/include/foreign/fdwapi.h             |   7 +-
 7 files changed, 530 insertions(+), 205 deletions(-)

diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c
index 549821c..2344f01 100644
--- a/contrib/file_fdw/file_fdw.c
+++ b/contrib/file_fdw/file_fdw.c
@@ -19,6 +19,7 @@
 #include "access/reloptions.h"
 #include "access/sysattr.h"
 #include "access/table.h"
+#include "access/tableam.h"
 #include "catalog/pg_authid.h"
 #include "catalog/pg_foreign_table.h"
 #include "commands/copy.h"
@@ -157,10 +158,8 @@ static void estimate_size(PlannerInfo *root, RelOptInfo *baserel,
 static void estimate_costs(PlannerInfo *root, RelOptInfo *baserel,
 						   FileFdwPlanState *fdw_private,
 						   Cost *startup_cost, Cost *total_cost);
-static int	file_acquire_sample_rows(Relation onerel, int elevel,
-									 HeapTuple *rows, int targrows,
-									 double *totalrows, double *totaldeadrows);
-
+static void file_acquire_sample_rows(Relation onerel, int elevel,
+									 AnalyzeSampleContext *context);
 
 /*
  * Foreign-data wrapper handler function: return a struct with pointers
@@ -1091,14 +1090,16 @@ estimate_costs(PlannerInfo *root, RelOptInfo *baserel,
  * may be meaningless, but it's OK because we don't use the estimates
  * currently (the planner only pays attention to correlation for indexscans).
  */
-static int
+static void
 file_acquire_sample_rows(Relation onerel, int elevel,
-						 HeapTuple *rows, int targrows,
-						 double *totalrows, double *totaldeadrows)
+						 AnalyzeSampleContext *context)
 {
 	int			numrows = 0;
+	int			targrows = 0;
+	double		totalrows = 0;
 	double		rowstoskip = -1;	/* -1 means not set yet */
 	ReservoirStateData rstate;
+	HeapTuple	tuple;
 	TupleDesc	tupDesc;
 	Datum	   *values;
 	bool	   *nulls;
@@ -1111,6 +1112,8 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 	MemoryContext oldcontext = CurrentMemoryContext;
 	MemoryContext tupcontext;
 
+	targrows = context->targrows;
+
 	Assert(onerel);
 	Assert(targrows > 0);
 
@@ -1144,8 +1147,6 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 	errcallback.previous = error_context_stack;
 	error_context_stack = &errcallback;
 
-	*totalrows = 0;
-	*totaldeadrows = 0;
 	for (;;)
 	{
 		/* Check for user-requested abort or sleep */
@@ -1170,7 +1171,8 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 		 */
 		if (numrows < targrows)
 		{
-			rows[numrows++] = heap_form_tuple(tupDesc, values, nulls);
+			tuple = heap_form_tuple(tupDesc, values, nulls);
+			AnalyzeRecordSampleRow(context, NULL, tuple, ANALYZE_SAMPLE_DATA, numrows++, false /* replace */, false);
 		}
 		else
 		{
@@ -1180,7 +1182,7 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 			 * not-yet-incremented value of totalrows as t.
 			 */
 			if (rowstoskip < 0)
-				rowstoskip = reservoir_get_next_S(&rstate, *totalrows, targrows);
+				rowstoskip = reservoir_get_next_S(&rstate, totalrows, targrows);
 
 			if (rowstoskip <= 0)
 			{
@@ -1191,14 +1193,14 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 				int			k = (int) (targrows * sampler_random_fract(rstate.randstate));
 
 				Assert(k >= 0 && k < targrows);
-				heap_freetuple(rows[k]);
-				rows[k] = heap_form_tuple(tupDesc, values, nulls);
+				tuple = heap_form_tuple(tupDesc, values, nulls);
+				AnalyzeRecordSampleRow(context, NULL, tuple, ANALYZE_SAMPLE_DATA, k, true /* replace */, false);
 			}
 
 			rowstoskip -= 1;
 		}
 
-		*totalrows += 1;
+		totalrows += 1;
 	}
 
 	/* Remove error callback. */
@@ -1219,7 +1221,8 @@ file_acquire_sample_rows(Relation onerel, int elevel,
 			(errmsg("\"%s\": file contains %.0f rows; "
 					"%d rows in sample",
 					RelationGetRelationName(onerel),
-					*totalrows, numrows)));
+					totalrows, numrows)));
 
-	return numrows;
+	context->totalrows += totalrows;
+	context->totalsampledrows += numrows;
 }
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index bdc21b3..f0789cc 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -17,6 +17,7 @@
 #include "access/htup_details.h"
 #include "access/sysattr.h"
 #include "access/table.h"
+#include "access/tableam.h"
 #include "catalog/pg_class.h"
 #include "commands/defrem.h"
 #include "commands/explain.h"
@@ -237,7 +238,6 @@ typedef struct PgFdwAnalyzeState
 	List	   *retrieved_attrs;	/* attr numbers retrieved by query */
 
 	/* collected sample rows */
-	HeapTuple  *rows;			/* array of size targrows */
 	int			targrows;		/* target # of sample rows */
 	int			numrows;		/* # of sample rows collected */
 
@@ -463,12 +463,11 @@ static void process_query_params(ExprContext *econtext,
 								 FmgrInfo *param_flinfo,
 								 List *param_exprs,
 								 const char **param_values);
-static int	postgresAcquireSampleRowsFunc(Relation relation, int elevel,
-										  HeapTuple *rows, int targrows,
-										  double *totalrows,
-										  double *totaldeadrows);
+static void	postgresAcquireSampleRowsFunc(Relation relation, int elevel,
+										  AnalyzeSampleContext *context);
 static void analyze_row_processor(PGresult *res, int row,
-								  PgFdwAnalyzeState *astate);
+								  PgFdwAnalyzeState *astate,
+								  AnalyzeSampleContext *context);
 static HeapTuple make_tuple_from_result_row(PGresult *res,
 											int row,
 											Relation rel,
@@ -4488,11 +4487,9 @@ postgresAnalyzeForeignTable(Relation relation,
  * may be meaningless, but it's OK because we don't use the estimates
  * currently (the planner only pays attention to correlation for indexscans).
  */
-static int
+static void 
 postgresAcquireSampleRowsFunc(Relation relation, int elevel,
-							  HeapTuple *rows, int targrows,
-							  double *totalrows,
-							  double *totaldeadrows)
+							  AnalyzeSampleContext *context)
 {
 	PgFdwAnalyzeState astate;
 	ForeignTable *table;
@@ -4506,13 +4503,11 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
 	/* Initialize workspace state */
 	astate.rel = relation;
 	astate.attinmeta = TupleDescGetAttInMetadata(RelationGetDescr(relation));
-
-	astate.rows = rows;
-	astate.targrows = targrows;
+	astate.targrows = context->targrows;
 	astate.numrows = 0;
 	astate.samplerows = 0;
 	astate.rowstoskip = -1;		/* -1 means not set yet */
-	reservoir_init_selection_state(&astate.rstate, targrows);
+	reservoir_init_selection_state(&astate.rstate, astate.targrows);
 
 	/* Remember ANALYZE context, and create a per-tuple temp context */
 	astate.anl_cxt = CurrentMemoryContext;
@@ -4604,7 +4599,7 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
 			/* Process whatever we got. */
 			numrows = PQntuples(res);
 			for (i = 0; i < numrows; i++)
-				analyze_row_processor(res, i, &astate);
+				analyze_row_processor(res, i, &astate, context);
 
 			PQclear(res);
 			res = NULL;
@@ -4628,10 +4623,13 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
 	ReleaseConnection(conn);
 
 	/* We assume that we have no dead tuple. */
-	*totaldeadrows = 0.0;
+	context->totaldeadrows = 0.0;
 
 	/* We've retrieved all living tuples from foreign server. */
-	*totalrows = astate.samplerows;
+	context->totalrows += astate.samplerows;
+
+	/* Increase the number of sample rows stored in the context */
+	context->totalsampledrows += astate.numrows;
 
 	/*
 	 * Emit some interesting relation info
@@ -4640,8 +4638,6 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
 			(errmsg("\"%s\": table contains %.0f rows, %d rows in sample",
 					RelationGetRelationName(relation),
 					astate.samplerows, astate.numrows)));
-
-	return astate.numrows;
 }
 
 /*
@@ -4650,10 +4646,11 @@ postgresAcquireSampleRowsFunc(Relation relation, int elevel,
  *	 - Subsequently, replace already-sampled tuples randomly.
  */
 static void
-analyze_row_processor(PGresult *res, int row, PgFdwAnalyzeState *astate)
+analyze_row_processor(PGresult *res, int row, PgFdwAnalyzeState *astate, AnalyzeSampleContext *context)
 {
 	int			targrows = astate->targrows;
 	int			pos;			/* array index to store tuple in */
+	bool		replace;
 	MemoryContext oldcontext;
 
 	/* Always increment sample row counter. */
@@ -4667,6 +4664,7 @@ analyze_row_processor(PGresult *res, int row, PgFdwAnalyzeState *astate)
 	{
 		/* First targrows rows are always included into the sample */
 		pos = astate->numrows++;
+		replace = false;
 	}
 	else
 	{
@@ -4683,7 +4681,7 @@ analyze_row_processor(PGresult *res, int row, PgFdwAnalyzeState *astate)
 			/* Choose a random reservoir element to replace. */
 			pos = (int) (targrows * sampler_random_fract(astate->rstate.randstate));
 			Assert(pos >= 0 && pos < targrows);
-			heap_freetuple(astate->rows[pos]);
+			replace = true;
 		}
 		else
 		{
@@ -4696,18 +4694,22 @@ analyze_row_processor(PGresult *res, int row, PgFdwAnalyzeState *astate)
 
 	if (pos >= 0)
 	{
+		HeapTuple		tuple;
 		/*
 		 * Create sample tuple from current result row, and store it in the
 		 * position determined above.  The tuple has to be created in anl_cxt.
 		 */
 		oldcontext = MemoryContextSwitchTo(astate->anl_cxt);
 
-		astate->rows[pos] = make_tuple_from_result_row(res, row,
-													   astate->rel,
-													   astate->attinmeta,
-													   astate->retrieved_attrs,
-													   NULL,
-													   astate->temp_cxt);
+		tuple = make_tuple_from_result_row(res, row,
+										   astate->rel,
+										   astate->attinmeta,
+										   astate->retrieved_attrs,
+										   NULL,
+										   astate->temp_cxt);
+
+		/* Tuple is already created in anl_cxt, we can record it directly */
+		AnalyzeRecordSampleRow(context, NULL, tuple, ANALYZE_SAMPLE_DATA, pos, replace, false);
 
 		MemoryContextSwitchTo(oldcontext);
 	}
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 253849e..c57c670 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -35,6 +35,7 @@
 #include "executor/executor.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "parser/analyze.h"
 #include "storage/bufmgr.h"
 #include "storage/bufpage.h"
 #include "storage/lmgr.h"
@@ -44,6 +45,7 @@
 #include "utils/builtins.h"
 #include "utils/rel.h"
 
+static int	compare_rows(const void *a, const void *b);
 static void reform_and_rewrite_tuple(HeapTuple tuple,
 									 Relation OldHeap, Relation NewHeap,
 									 Datum *values, bool *isnull, RewriteState rwstate);
@@ -974,10 +976,25 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 	pfree(isnull);
 }
 
+static void
+heapam_scan_analyze_beginscan(Relation onerel, AnalyzeSampleContext *context)
+{
+	context->scan = table_beginscan_analyze(onerel);
+
+	/* initialize the totalblocks analyze can scan */
+	context->totalblocks = RelationGetNumberOfBlocks(onerel);
+
+	/* reset the statistic */
+	context->liverows = 0;
+	context->deadrows = 0;
+	context->ordered = true;
+}
+
 static bool
-heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
-							   BufferAccessStrategy bstrategy)
+heapam_scan_analyze_next_block(BlockNumber blockno,
+							   AnalyzeSampleContext *context)
 {
+	TableScanDesc scan = context->scan;
 	HeapScanDesc hscan = (HeapScanDesc) scan;
 
 	/*
@@ -992,7 +1009,7 @@ heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
 	hscan->rs_cblock = blockno;
 	hscan->rs_cindex = FirstOffsetNumber;
 	hscan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM,
-										blockno, RBM_NORMAL, bstrategy);
+										blockno, RBM_NORMAL, context->bstrategy);
 	LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
 
 	/* in heap all blocks can contain tuples, so always return true */
@@ -1000,14 +1017,14 @@ heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
 }
 
 static bool
-heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
-							   double *liverows, double *deadrows,
-							   TupleTableSlot *slot)
+heapam_scan_analyze_next_tuple(TransactionId OldestXmin, AnalyzeSampleContext *context)
 {
+	TableScanDesc scan = context->scan;
 	HeapScanDesc hscan = (HeapScanDesc) scan;
 	Page		targpage;
 	OffsetNumber maxoffset;
 	BufferHeapTupleTableSlot *hslot;
+	TupleTableSlot *slot = AnalyzeGetSampleSlot(context, scan->rs_rd, ANALYZE_SAMPLE_DATA);
 
 	Assert(TTS_IS_BUFFERTUPLE(slot));
 
@@ -1033,7 +1050,7 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 		if (!ItemIdIsNormal(itemid))
 		{
 			if (ItemIdIsDead(itemid))
-				*deadrows += 1;
+				context->deadrows += 1;
 			continue;
 		}
 
@@ -1048,13 +1065,13 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 		{
 			case HEAPTUPLE_LIVE:
 				sample_it = true;
-				*liverows += 1;
+				context->liverows += 1;
 				break;
 
 			case HEAPTUPLE_DEAD:
 			case HEAPTUPLE_RECENTLY_DEAD:
 				/* Count dead and recently-dead rows */
-				*deadrows += 1;
+				context->deadrows += 1;
 				break;
 
 			case HEAPTUPLE_INSERT_IN_PROGRESS:
@@ -1080,7 +1097,7 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 				if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data)))
 				{
 					sample_it = true;
-					*liverows += 1;
+					context->liverows += 1;
 				}
 				break;
 
@@ -1109,11 +1126,11 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 				 * concurrent transaction never commits.
 				 */
 				if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data)))
-					*deadrows += 1;
+					context->deadrows += 1;
 				else
 				{
 					sample_it = true;
-					*liverows += 1;
+					context->liverows += 1;
 				}
 				break;
 
@@ -1142,6 +1159,71 @@ heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 	return false;
 }
 
+static void 
+heapam_scan_analyze_sample_tuple(int pos, bool replace, AnalyzeSampleContext *context)
+{
+	TupleTableSlot *slot;
+	Relation onerel = context->scan->rs_rd;
+
+	Assert(pos >= 0);
+	/* 
+	 * heapam_scan_analyze_next_tuple should already put the tuple
+	 * in the sample slot, just record it into the array of sample
+	 * rows.
+	 */
+	slot = AnalyzeGetSampleSlot(context, onerel, ANALYZE_SAMPLE_DATA);
+	AnalyzeRecordSampleRow(context, slot, NULL, ANALYZE_SAMPLE_DATA, pos, replace, true);
+
+	/*
+	 * if replace happens, the sample rows are no longer ordered
+	 * in physical position. 
+	 */
+	if (replace)
+		context->ordered = false;
+}
+
+static void
+heapam_scan_analyze_endscan(AnalyzeSampleContext *context)
+{
+	HeapTuple *rows = AnalyzeGetSampleRows(context, ANALYZE_SAMPLE_DATA, context->totalsampledrows);
+
+	/*
+	 * If we didn't find as many tuples as we wanted then we're done. No sort
+	 * is needed, since they're already in order.
+	 *
+	 * Otherwise we need to sort the collected tuples by position
+	 * (itempointer).
+	 */
+	if (!context->ordered)
+		qsort((void *)rows, context->targrows, sizeof(HeapTuple), compare_rows);
+
+	table_endscan(context->scan);
+}
+
+/*
+ * qsort comparator for sorting rows[] array
+ */
+static int
+compare_rows(const void *a, const void *b)
+{
+	HeapTuple	ha = *(const HeapTuple *) a;
+	HeapTuple	hb = *(const HeapTuple *) b;
+	BlockNumber ba = ItemPointerGetBlockNumber(&ha->t_self);
+	OffsetNumber oa = ItemPointerGetOffsetNumber(&ha->t_self);
+	BlockNumber bb = ItemPointerGetBlockNumber(&hb->t_self);
+	OffsetNumber ob = ItemPointerGetOffsetNumber(&hb->t_self);
+
+	if (ba < bb)
+		return -1;
+	if (ba > bb)
+		return 1;
+	if (oa < ob)
+		return -1;
+	if (oa > ob)
+		return 1;
+	return 0;
+}
+
 static double
 heapam_index_build_range_scan(Relation heapRelation,
 							  Relation indexRelation,
@@ -2529,8 +2611,11 @@ static const TableAmRoutine heapam_methods = {
 	.relation_copy_data = heapam_relation_copy_data,
 	.relation_copy_for_cluster = heapam_relation_copy_for_cluster,
 	.relation_vacuum = heap_vacuum_rel,
+	.scan_analyze_beginscan = heapam_scan_analyze_beginscan,
 	.scan_analyze_next_block = heapam_scan_analyze_next_block,
 	.scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
+	.scan_analyze_sample_tuple = heapam_scan_analyze_sample_tuple,
+	.scan_analyze_endscan = heapam_scan_analyze_endscan,
 	.index_build_range_scan = heapam_index_build_range_scan,
 	.index_validate_scan = heapam_index_validate_scan,
 
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index b9ed336..d40eff0 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -23,7 +23,9 @@
 
 #include "access/heapam.h"		/* for ss_* */
 #include "access/tableam.h"
+#include "access/tupconvert.h"
 #include "access/xact.h"
+#include "catalog/pg_type.h"
 #include "optimizer/plancat.h"
 #include "storage/bufmgr.h"
 #include "storage/shmem.h"
@@ -650,3 +652,210 @@ table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
 	else
 		*allvisfrac = (double) relallvisible / curpages;
 }
+
+/* Create the analyze sample context to acquire sample rows */
+AnalyzeSampleContext *
+CreateAnalyzeSampleContext(Relation onerel,
+						   List *anl_cols,
+						   int totaltargrows,
+						   BufferAccessStrategy strategy)
+{
+	AnalyzeSampleContext *context;
+	
+	context = (AnalyzeSampleContext *) palloc(sizeof(AnalyzeSampleContext));
+	context->parent = onerel;
+	context->anl_cols = anl_cols;
+	context->bstrategy = strategy;
+	context->totaltargrows = totaltargrows;
+	context->targrows = totaltargrows;
+	context->scan = NULL;
+	context->totalblocks = 0;
+	context->totalrows = 0;
+	context->totaldeadrows = 0;
+	context->totalsampledrows = 0;
+	context->liverows = 0;
+	context->deadrows = 0;
+	context->ordered = false;
+	context->tup_convert_map = NULL;
+
+	/* empty all sample type */
+	memset(context->sample_slots, 0, MAX_ANALYZE_SAMPLE * sizeof(TupleTableSlot *));
+	memset(context->sample_rows, 0, MAX_ANALYZE_SAMPLE * sizeof(HeapTuple *));
+
+	return context;
+}
+
+/* Destroy analyze sample context */
+void
+DestroyAnalyzeSampleContext(AnalyzeSampleContext *context)
+{
+	for (int i = 0; i < MAX_ANALYZE_SAMPLE; i++)
+	{
+		TupleTableSlot *slot = context->sample_slots[i];
+		if (slot)
+			ExecDropSingleTupleTableSlot(slot);
+	}
+}
+
+/* 
+ * To acquire sample rows from an inherited table, all child
+ * relations use the same analyze sample context, this function
+ * must be called before starting analyze a new child relation.
+ */
+void
+InitAnalyzeSampleContextForChild(AnalyzeSampleContext *context,
+								 Relation child,
+								 int childtargrows)
+{
+	/* Set targrows to childtargrows */
+	context->targrows = childtargrows;
+
+	/* We may need to convert from child's rowtype to parent's */
+	if (!equalTupleDescs(RelationGetDescr(child),
+						 RelationGetDescr(context->parent)))
+	{
+		if (context->tup_convert_map)
+			free_conversion_map(context->tup_convert_map);
+		/* Create a convert map so it can be used when recording sample rows */
+		context->tup_convert_map =
+			convert_tuples_by_name(RelationGetDescr(child),
+								   RelationGetDescr(context->parent));
+
+		/* We also cannot use previous sample slot anymore */
+		if (context->sample_slots[ANALYZE_SAMPLE_DATA])
+		{
+			ExecDropSingleTupleTableSlot(context->sample_slots[ANALYZE_SAMPLE_DATA]);
+			context->sample_slots[ANALYZE_SAMPLE_DATA] = NULL;
+		}
+	}
+}
+
+void
+AnalyzeGetSampleStats(AnalyzeSampleContext *context,
+					  int *totalsampledrows,
+					  double *totalrows,
+					  double *totaldeadrows)
+{
+	if (totalsampledrows)
+		*totalsampledrows = context->totalsampledrows;
+	if (totalrows)
+		*totalrows = context->totalrows;
+	if (totaldeadrows)
+		*totaldeadrows = context->totaldeadrows;
+}
+
+
+/* 
+ * Get or initialize a sample slot to hold sample tuple, normally
+ * the tuple in the slot will be copied to the sample_rows[type]
+ * by AnalyzeRecordSampleRow().
+ */
+TupleTableSlot *
+AnalyzeGetSampleSlot(AnalyzeSampleContext *context,
+					 Relation onerel,
+					 AnalyzeSampleType type)
+{
+	TupleDesc tupdesc;
+	int attr_cnt = onerel->rd_att->natts;
+
+	if (context->sample_slots[type])
+		return context->sample_slots[type]; 
+
+	switch (type)
+	{
+		case ANALYZE_SAMPLE_DATA:
+			tupdesc = RelationGetDescr(onerel);
+			break;
+		case ANALYZE_SAMPLE_DISKSIZE:
+			tupdesc = CreateTemplateTupleDesc(attr_cnt);
+			for (int i = 1; i <= attr_cnt; i++)
+				TupleDescInitEntry(tupdesc, i, "", FLOAT8OID, -1, 0);
+			break;
+		default:
+			elog(ERROR, "unknown analyze sample type");
+	}
+
+	context->sample_slots[type] =
+		MakeSingleTupleTableSlot(tupdesc, table_slot_callbacks(onerel));
+	return context->sample_slots[type];
+}
+
+HeapTuple *
+AnalyzeGetSampleRows(AnalyzeSampleContext *context,
+					 AnalyzeSampleType type,
+					 int offset)
+{
+	Assert(offset < context->totaltargrows);
+	if (!context->sample_rows[type])
+		context->sample_rows[type] =
+			(HeapTuple *) palloc(context->totaltargrows * sizeof(HeapTuple));
+
+	return context->sample_rows[type] + offset;
+}
+
+/*
+ * Record a sample tuple into sample_rows[type].
+ * 
+ * sample_tuple:
+ * 		Input sample tuple. Sometimes, a caller has already
+ * 		formed the sample tuple in its memory context, so we can
+ * 		record it directly. 
+ * sample_slot: 
+ * 		Slot which contains the sample tuple. We need to copy
+ * 		the sample tuple and then record it.
+ * pos:
+ * 		The position in the sample_rows[type].
+ * replace:
+ * 		Replace the old sample tuple in the specified position.
+ * withtid:
+ * 		Set the tid of the sample tuple; this is only valid when
+ * 		sample_slot is set.
+ *
+ * We prefer to use sample_slot if both sample_tuple and
+ * sample_slot are set; sample_slot is the most common case. 
+ */
+void
+AnalyzeRecordSampleRow(AnalyzeSampleContext *context,
+					   TupleTableSlot *sample_slot,
+					   HeapTuple sample_tuple,
+					   AnalyzeSampleType type,
+					   int pos,
+					   bool replace,
+					   bool withtid)
+{
+	HeapTuple tuple;
+	HeapTuple *rows;
+
+	rows = AnalyzeGetSampleRows(context, type, context->totalsampledrows);
+
+	/* We need to free the old tuple if replace is true */
+	if (replace)
+		heap_freetuple(rows[pos]);
+
+	Assert(sample_slot || sample_tuple);
+	if (sample_slot)
+		tuple = ExecCopySlotHeapTuple(sample_slot);
+	else
+		tuple = sample_tuple;
+
+	/* We may need to convert from child's rowtype to parent's */
+	if (context->tup_convert_map != NULL)
+	{
+		HeapTuple	newtup;
+		newtup = execute_attr_map_tuple(tuple, context->tup_convert_map);
+		heap_freetuple(tuple);
+		tuple = newtup;
+	}
+
+	if (withtid && sample_slot)
+		tuple->t_self = sample_slot->tts_tid;
+
+	/* store the tuple to right position */
+	rows[pos] = tuple;
+}
+
+bool
+AnalyzeSampleIsValid(AnalyzeSampleContext *context, AnalyzeSampleType type)
+{
+	return context->sample_rows[type] != NULL;
+}
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index e2033f9..cc1649d 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -83,7 +83,6 @@ int			default_statistics_target = 100;
 static MemoryContext anl_context = NULL;
 static BufferAccessStrategy vac_strategy;
 
-
 static void do_analyze_rel(Relation onerel,
 						   VacuumParams *params, List *va_cols,
 						   AcquireSampleRowsFunc acquirefunc, BlockNumber relpages,
@@ -94,13 +93,10 @@ static void compute_index_stats(Relation onerel, double totalrows,
 								MemoryContext col_context);
 static VacAttrStats *examine_attribute(Relation onerel, int attnum,
 									   Node *index_expr);
-static int	acquire_sample_rows(Relation onerel, int elevel,
-								HeapTuple *rows, int targrows,
-								double *totalrows, double *totaldeadrows);
-static int	compare_rows(const void *a, const void *b);
-static int	acquire_inherited_sample_rows(Relation onerel, int elevel,
-										  HeapTuple *rows, int targrows,
-										  double *totalrows, double *totaldeadrows);
+static void	acquire_sample_rows(Relation onerel, int elevel,
+								AnalyzeSampleContext *context);
+static void	acquire_inherited_sample_rows(Relation onerel, int elevel,
+										  AnalyzeSampleContext *context);
 static void update_attstats(Oid relid, bool inh,
 							int natts, VacAttrStats **vacattrstats);
 static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
@@ -318,6 +314,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 	Oid			save_userid;
 	int			save_sec_context;
 	int			save_nestlevel;
+	AnalyzeSampleContext *sample_context;
 
 	if (inh)
 		ereport(elevel,
@@ -502,18 +499,21 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 	if (targrows < minrows)
 		targrows = minrows;
 
+	/* create context for acquiring sample rows */
+	sample_context = CreateAnalyzeSampleContext(onerel, va_cols, targrows,
+												vac_strategy);
+
 	/*
 	 * Acquire the sample rows
 	 */
-	rows = (HeapTuple *) palloc(targrows * sizeof(HeapTuple));
 	if (inh)
-		numrows = acquire_inherited_sample_rows(onerel, elevel,
-												rows, targrows,
-												&totalrows, &totaldeadrows);
+		acquire_inherited_sample_rows(onerel, elevel, sample_context);
 	else
-		numrows = (*acquirefunc) (onerel, elevel,
-								  rows, targrows,
-								  &totalrows, &totaldeadrows);
+		(*acquirefunc) (onerel, elevel, sample_context); 
+
+	/* Get the sample statistics */
+	AnalyzeGetSampleStats(sample_context, &numrows, &totalrows, &totaldeadrows);
+	rows = AnalyzeGetSampleRows(sample_context, ANALYZE_SAMPLE_DATA, 0);
 
 	/*
 	 * Compute the statistics.  Temporary results during the calculations for
@@ -592,7 +592,8 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 		 * not for relations representing inheritance trees.
 		 */
 		if (!inh)
-			BuildRelationExtStatistics(onerel, totalrows, numrows, rows,
+			BuildRelationExtStatistics(onerel, totalrows, numrows,
+									   rows,
 									   attr_cnt, vacattrstats);
 	}
 
@@ -690,6 +691,8 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 							pg_rusage_show(&ru0))));
 	}
 
+	DestroyAnalyzeSampleContext(sample_context);
+
 	/* Roll back any GUC changes executed by index functions */
 	AtEOXact_GUC(false, save_nestlevel);
 
@@ -1018,26 +1021,26 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr)
  * block.  The previous sampling method put too much credence in the row
  * density near the start of the table.
  */
-static int
+static void 
 acquire_sample_rows(Relation onerel, int elevel,
-					HeapTuple *rows, int targrows,
-					double *totalrows, double *totaldeadrows)
+					AnalyzeSampleContext *context)
 {
 	int			numrows = 0;	/* # rows now in reservoir */
+	int			targrows = context->targrows;
 	double		samplerows = 0; /* total # rows collected */
-	double		liverows = 0;	/* # live rows seen */
-	double		deadrows = 0;	/* # dead rows seen */
 	double		rowstoskip = -1;	/* -1 means not set yet */
+	double		totalrows = 0;
+	double		totaldeadrows = 0;
 	BlockNumber totalblocks;
 	TransactionId OldestXmin;
 	BlockSamplerData bs;
 	ReservoirStateData rstate;
-	TupleTableSlot *slot;
-	TableScanDesc scan;
 
 	Assert(targrows > 0);
 
-	totalblocks = RelationGetNumberOfBlocks(onerel);
+	table_scan_analyze_beginscan(onerel, context);
+
+	totalblocks = context->totalblocks;
 
 	/* Need a cutoff xmin for HeapTupleSatisfiesVacuum */
 	OldestXmin = GetOldestXmin(onerel, PROCARRAY_FLAGS_VACUUM);
@@ -1047,9 +1050,6 @@ acquire_sample_rows(Relation onerel, int elevel,
 	/* Prepare for sampling rows */
 	reservoir_init_selection_state(&rstate, targrows);
 
-	scan = table_beginscan_analyze(onerel);
-	slot = table_slot_create(onerel, NULL);
-
 	/* Outer loop over blocks to sample */
 	while (BlockSampler_HasMore(&bs))
 	{
@@ -1057,10 +1057,10 @@ acquire_sample_rows(Relation onerel, int elevel,
 
 		vacuum_delay_point();
 
-		if (!table_scan_analyze_next_block(scan, targblock, vac_strategy))
+		if (!table_scan_analyze_next_block(targblock, context))
 			continue;
 
-		while (table_scan_analyze_next_tuple(scan, OldestXmin, &liverows, &deadrows, slot))
+		while (table_scan_analyze_next_tuple(OldestXmin, context))
 		{
 			/*
 			 * The first targrows sample rows are simply copied into the
@@ -1076,8 +1076,8 @@ acquire_sample_rows(Relation onerel, int elevel,
 			 */
 			if (numrows < targrows)
 			{
-				rows[numrows] = ExecCopySlotHeapTuple(slot);
-				rows[numrows]->t_self = slot->tts_tid;
+				table_scan_analyze_sample_tuple(numrows, false, context);
+
 				numrows++;
 			}
 			else
@@ -1099,9 +1099,8 @@ acquire_sample_rows(Relation onerel, int elevel,
 					int			k = (int) (targrows * sampler_random_fract(rstate.randstate));
 
 					Assert(k >= 0 && k < targrows);
-					heap_freetuple(rows[k]);
-					rows[k] = ExecCopySlotHeapTuple(slot);
-					rows[k]->t_self = slot->tts_tid;
+
+					table_scan_analyze_sample_tuple(k, true, context);
 				}
 
 				rowstoskip -= 1;
@@ -1111,19 +1110,7 @@ acquire_sample_rows(Relation onerel, int elevel,
 		}
 	}
 
-	ExecDropSingleTupleTableSlot(slot);
-	table_endscan(scan);
-
-	/*
-	 * If we didn't find as many tuples as we wanted then we're done. No sort
-	 * is needed, since they're already in order.
-	 *
-	 * Otherwise we need to sort the collected tuples by position
-	 * (itempointer). It's not worth worrying about corner cases where the
-	 * tuples are already sorted.
-	 */
-	if (numrows == targrows)
-		qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
+	table_scan_analyze_endscan(context);
 
 	/*
 	 * Estimate total numbers of live and dead rows in relation, extrapolating
@@ -1134,13 +1121,13 @@ acquire_sample_rows(Relation onerel, int elevel,
 	 */
 	if (bs.m > 0)
 	{
-		*totalrows = floor((liverows / bs.m) * totalblocks + 0.5);
-		*totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5);
+		totalrows = floor((context->liverows / bs.m) * totalblocks + 0.5);
+		totaldeadrows = floor((context->deadrows / bs.m) * totalblocks + 0.5);
 	}
 	else
 	{
-		*totalrows = 0.0;
-		*totaldeadrows = 0.0;
+		totalrows = 0.0;
+		totaldeadrows = 0.0;
 	}
 
 	/*
@@ -1152,34 +1139,13 @@ acquire_sample_rows(Relation onerel, int elevel,
 					"%d rows in sample, %.0f estimated total rows",
 					RelationGetRelationName(onerel),
 					bs.m, totalblocks,
-					liverows, deadrows,
-					numrows, *totalrows)));
+					context->liverows,
+					context->deadrows,
+					numrows, totalrows)));
 
-	return numrows;
-}
-
-/*
- * qsort comparator for sorting rows[] array
- */
-static int
-compare_rows(const void *a, const void *b)
-{
-	HeapTuple	ha = *(const HeapTuple *) a;
-	HeapTuple	hb = *(const HeapTuple *) b;
-	BlockNumber ba = ItemPointerGetBlockNumber(&ha->t_self);
-	OffsetNumber oa = ItemPointerGetOffsetNumber(&ha->t_self);
-	BlockNumber bb = ItemPointerGetBlockNumber(&hb->t_self);
-	OffsetNumber ob = ItemPointerGetOffsetNumber(&hb->t_self);
-
-	if (ba < bb)
-		return -1;
-	if (ba > bb)
-		return 1;
-	if (oa < ob)
-		return -1;
-	if (oa > ob)
-		return 1;
-	return 0;
+	context->totalrows += totalrows;
+	context->totaldeadrows += totaldeadrows;
+	context->totalsampledrows += numrows;
 }
 
 
@@ -1191,18 +1157,16 @@ compare_rows(const void *a, const void *b)
  * We fail and return zero if there are no inheritance children, or if all
  * children are foreign tables that don't support ANALYZE.
  */
-static int
+static void
 acquire_inherited_sample_rows(Relation onerel, int elevel,
-							  HeapTuple *rows, int targrows,
-							  double *totalrows, double *totaldeadrows)
+							  AnalyzeSampleContext *context)
 {
 	List	   *tableOIDs;
 	Relation   *rels;
 	AcquireSampleRowsFunc *acquirefuncs;
 	double	   *relblocks;
 	double		totalblocks;
-	int			numrows,
-				nrels,
+	int			nrels,
 				i;
 	ListCell   *lc;
 	bool		has_child;
@@ -1230,7 +1194,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel,
 				(errmsg("skipping analyze of \"%s.%s\" inheritance tree --- this inheritance tree contains no child tables",
 						get_namespace_name(RelationGetNamespace(onerel)),
 						RelationGetRelationName(onerel))));
-		return 0;
+		return;
 	}
 
 	/*
@@ -1328,7 +1292,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel,
 				(errmsg("skipping analyze of \"%s.%s\" inheritance tree --- this inheritance tree contains no analyzable child tables",
 						get_namespace_name(RelationGetNamespace(onerel)),
 						RelationGetRelationName(onerel))));
-		return 0;
+		return;
 	}
 
 	/*
@@ -1337,9 +1301,6 @@ acquire_inherited_sample_rows(Relation onerel, int elevel,
 	 * rels have radically different free-space percentages, but it's not
 	 * clear that it's worth working harder.)
 	 */
-	numrows = 0;
-	*totalrows = 0;
-	*totaldeadrows = 0;
 	for (i = 0; i < nrels; i++)
 	{
 		Relation	childrel = rels[i];
@@ -1350,49 +1311,15 @@ acquire_inherited_sample_rows(Relation onerel, int elevel,
 		{
 			int			childtargrows;
 
-			childtargrows = (int) rint(targrows * childblocks / totalblocks);
+			childtargrows = (int) rint(context->totaltargrows * childblocks / totalblocks);
 			/* Make sure we don't overrun due to roundoff error */
-			childtargrows = Min(childtargrows, targrows - numrows);
+			childtargrows = Min(childtargrows, context->totaltargrows - context->totalsampledrows);
 			if (childtargrows > 0)
 			{
-				int			childrows;
-				double		trows,
-							tdrows;
+				InitAnalyzeSampleContextForChild(context, childrel, childtargrows);
 
 				/* Fetch a random sample of the child's rows */
-				childrows = (*acquirefunc) (childrel, elevel,
-											rows + numrows, childtargrows,
-											&trows, &tdrows);
-
-				/* We may need to convert from child's rowtype to parent's */
-				if (childrows > 0 &&
-					!equalTupleDescs(RelationGetDescr(childrel),
-									 RelationGetDescr(onerel)))
-				{
-					TupleConversionMap *map;
-
-					map = convert_tuples_by_name(RelationGetDescr(childrel),
-												 RelationGetDescr(onerel));
-					if (map != NULL)
-					{
-						int			j;
-
-						for (j = 0; j < childrows; j++)
-						{
-							HeapTuple	newtup;
-
-							newtup = execute_attr_map_tuple(rows[numrows + j], map);
-							heap_freetuple(rows[numrows + j]);
-							rows[numrows + j] = newtup;
-						}
-						free_conversion_map(map);
-					}
-				}
-
-				/* And add to counts */
-				numrows += childrows;
-				*totalrows += trows;
-				*totaldeadrows += tdrows;
+				(*acquirefunc) (childrel, elevel, context);
 			}
 		}
 
@@ -1402,8 +1329,6 @@ acquire_inherited_sample_rows(Relation onerel, int elevel,
 		 */
 		table_close(childrel, NoLock);
 	}
-
-	return numrows;
 }
 
 
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 0b882dc..90d2375 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -37,6 +37,66 @@ struct SampleScanState;
 struct TBMIterateResult;
 struct VacuumParams;
 struct ValidateIndexState;
+struct TupleConversionMap;
+
+typedef enum AnalyzeSampleType 
+{
+	ANALYZE_SAMPLE_DATA = 0,	/* real data per column */
+	ANALYZE_SAMPLE_DISKSIZE,	/* physical size per column */
+	MAX_ANALYZE_SAMPLE			/* must be last */
+} AnalyzeSampleType;
+
+typedef struct AnalyzeSampleContext
+{
+	/* Filled when context is created */
+	int		totaltargrows;
+	List	*anl_cols;
+	Relation parent;
+	BufferAccessStrategy bstrategy;
+
+	/* Filled by table AM analyze routines */
+	BlockNumber	totalblocks;
+	TableScanDesc scan;
+
+	/* 
+	 * Acquiring sample rows from an inherited table will invoke
+	 * multiple sampling iterations, one per child relation, so
+	 * the fields below are statistics for each single iteration.
+	 */
+	int		targrows;	/* target number of sample rows */
+	double 	liverows;
+	double 	deadrows;
+	bool	ordered;	/* are sample rows ordered physically */
+
+	/*
+	 * Statistics accumulated across all sampling iterations.
+	 */
+	int		totalsampledrows; /* total number of sample rows stored */
+	double	totalrows;
+	double	totaldeadrows;
+
+	/* 
+	 * If the child relation's rowtype differs from the
+	 * parent's, sample tuples must be converted to the
+	 * parent's rowtype.
+	 */
+	struct TupleConversionMap *tup_convert_map;
+
+	/*
+	 * Used by table AM analyze routines to store
+	 * the temporary tuple for different types of
+	 * sample rows, the tuple is finally stored to
+	 * sample_rows[] if the tuple is
+	 * randomly selected.
+	 */
+	TupleTableSlot* sample_slots[MAX_ANALYZE_SAMPLE];
+
+	/* 
+	 * stores the final sample rows which will be
+	 * used to compute statistics.
+	 */
+	HeapTuple* sample_rows[MAX_ANALYZE_SAMPLE];
+} AnalyzeSampleContext;
 
 /*
  * Bitmask values for the flags argument to the scan_begin callback.
@@ -532,9 +592,10 @@ typedef struct TableAmRoutine
 	 * clear what a good interface for non block based AMs would be, so there
 	 * isn't one yet.
 	 */
-	bool		(*scan_analyze_next_block) (TableScanDesc scan,
-											BlockNumber blockno,
-											BufferAccessStrategy bstrategy);
+	void		(*scan_analyze_beginscan) (Relation onerel, AnalyzeSampleContext *context);
+
+	bool		(*scan_analyze_next_block) (BlockNumber blockno,
+											AnalyzeSampleContext *context);
 
 	/*
 	 * See table_scan_analyze_next_tuple().
@@ -544,11 +605,13 @@ typedef struct TableAmRoutine
 	 * influence autovacuum scheduling (see comment for relation_vacuum
 	 * callback).
 	 */
-	bool		(*scan_analyze_next_tuple) (TableScanDesc scan,
-											TransactionId OldestXmin,
-											double *liverows,
-											double *deadrows,
-											TupleTableSlot *slot);
+	bool		(*scan_analyze_next_tuple) (TransactionId OldestXmin,
+											AnalyzeSampleContext *context);
+
+	void		(*scan_analyze_sample_tuple) (int pos, bool replace,
+											  AnalyzeSampleContext *context);
+
+	void		(*scan_analyze_endscan) (AnalyzeSampleContext *context);
 
 	/* see table_index_build_range_scan for reference about parameters */
 	double		(*index_build_range_scan) (Relation table_rel,
@@ -1474,6 +1537,12 @@ table_relation_vacuum(Relation rel, struct VacuumParams *params,
 	rel->rd_tableam->relation_vacuum(rel, params, bstrategy);
 }
 
+static inline void
+table_scan_analyze_beginscan(Relation rel, struct AnalyzeSampleContext *context)
+{
+	rel->rd_tableam->scan_analyze_beginscan(rel, context);
+}
+
 /*
  * Prepare to analyze block `blockno` of `scan`. The scan needs to have been
  * started with table_beginscan_analyze().  Note that this routine might
@@ -1483,11 +1552,10 @@ table_relation_vacuum(Relation rel, struct VacuumParams *params,
  * Returns false if block is unsuitable for sampling, true otherwise.
  */
 static inline bool
-table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
-							  BufferAccessStrategy bstrategy)
+table_scan_analyze_next_block(BlockNumber blockno,
+							  struct AnalyzeSampleContext *context)
 {
-	return scan->rs_rd->rd_tableam->scan_analyze_next_block(scan, blockno,
-															bstrategy);
+	return context->scan->rs_rd->rd_tableam->scan_analyze_next_block(blockno, context);
 }
 
 /*
@@ -1501,13 +1569,21 @@ table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
  * tuples.
  */
 static inline bool
-table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
-							  double *liverows, double *deadrows,
-							  TupleTableSlot *slot)
+table_scan_analyze_next_tuple(TransactionId OldestXmin, AnalyzeSampleContext *context)
+{
+	return context->scan->rs_rd->rd_tableam->scan_analyze_next_tuple(OldestXmin, context);
+}
+
+static inline void 
+table_scan_analyze_sample_tuple(Index sample, bool replace, AnalyzeSampleContext *context)
+{
+	context->scan->rs_rd->rd_tableam->scan_analyze_sample_tuple(sample, replace, context);
+}
+
+static inline void
+table_scan_analyze_endscan(AnalyzeSampleContext *context)
 {
-	return scan->rs_rd->rd_tableam->scan_analyze_next_tuple(scan, OldestXmin,
-															liverows, deadrows,
-															slot);
+	context->scan->rs_rd->rd_tableam->scan_analyze_endscan(context);
 }
 
 /*
@@ -1783,6 +1859,32 @@ extern void table_block_relation_estimate_size(Relation rel,
 											   Size usable_bytes_per_page);
 
 /* ----------------------------------------------------------------------------
+ * Helper functions to implement analyze scan. 
+ * ----------------------------------------------------------------------------
+ */
+extern AnalyzeSampleContext *
+CreateAnalyzeSampleContext(Relation onerel, List *cols, int targrows,
+						   BufferAccessStrategy strategy);
+extern void DestroyAnalyzeSampleContext(AnalyzeSampleContext *context);
+extern TupleTableSlot * AnalyzeGetSampleSlot(AnalyzeSampleContext *context,
+											 Relation onerel, AnalyzeSampleType type);
+extern void AnalyzeRecordSampleRow(AnalyzeSampleContext *context,
+								   TupleTableSlot *sample_slot,
+								   HeapTuple sample_tuple,
+								   AnalyzeSampleType type, int pos,
+								   bool replace, bool withtid);
+extern void InitAnalyzeSampleContextForChild(AnalyzeSampleContext *context,
+											 Relation child,
+											 int childtargrows);
+extern void AnalyzeGetSampleStats(AnalyzeSampleContext *context,
+								  int *totalsampledrows,
+								  double *totalrows,
+								  double *totaldeadrows);
+extern HeapTuple *
+AnalyzeGetSampleRows(AnalyzeSampleContext *context, AnalyzeSampleType type, int offset);
+extern bool AnalyzeSampleIsValid(AnalyzeSampleContext *context, AnalyzeSampleType type);
+
+/* ----------------------------------------------------------------------------
  * Functions in tableamapi.c
  * ----------------------------------------------------------------------------
  */
diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h
index 8226860..e0da119 100644
--- a/src/include/foreign/fdwapi.h
+++ b/src/include/foreign/fdwapi.h
@@ -18,6 +18,7 @@
 
 /* To avoid including explain.h here, reference ExplainState thus: */
 struct ExplainState;
+struct AnalyzeSampleContext;
 
 
 /*
@@ -139,10 +140,8 @@ typedef void (*ExplainForeignModify_function) (ModifyTableState *mtstate,
 typedef void (*ExplainDirectModify_function) (ForeignScanState *node,
 											  struct ExplainState *es);
 
-typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel,
-									  HeapTuple *rows, int targrows,
-									  double *totalrows,
-									  double *totaldeadrows);
+typedef void (*AcquireSampleRowsFunc) (Relation relation, int elevel,
+									   struct AnalyzeSampleContext *context);
 
 typedef bool (*AnalyzeForeignTable_function) (Relation relation,
 											  AcquireSampleRowsFunc *func,
-- 
1.8.3.1

From f347347cff55b7e12d7031be10b7d1fd4f4f3ea0 Mon Sep 17 00:00:00 2001
From: Pengzhou Tang <ptang@pivotal.io>
Date: Wed, 20 Nov 2019 06:43:33 -0500
Subject: [PATCH 2/3] Planner can estimate the pages based on the columns
 selected

The planner used to assume we need to scan all the pages even if we
only need one or two columns in a query. This is right for
heap tables; however, if we use a column store like
zedstore, we can estimate the number of pages from only the
selected columns, which will reduce the I/O cost and the number
of parallel workers in some cases.

To do this, this commit adds a new field `stadiskfrac` to
catalog `pg_statistic`; it records the fraction of the physical
size that a column uses compared to the whole table. The planner
will calculate a page selectivity based on the targetlist
and baserestrictinfo, then scale it with the rel->pages
obtained from estimate_rel_size().
---
 src/backend/commands/analyze.c        | 52 +++++++++++++++++++++++++++++++++++
 src/backend/optimizer/path/allpaths.c | 45 ++++++++++++++++++++++++++++++
 src/include/catalog/catversion.h      |  2 +-
 src/include/catalog/pg_statistic.h    |  3 ++
 src/include/commands/vacuum.h         |  6 ++++
 src/include/nodes/parsenodes.h        |  1 +
 6 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index cc1649d..9a8ae36 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -87,6 +87,9 @@ static void do_analyze_rel(Relation onerel,
 						   VacuumParams *params, List *va_cols,
 						   AcquireSampleRowsFunc acquirefunc, BlockNumber relpages,
 						   bool inh, bool in_outer_xact, int elevel);
+static void compute_disk_stats(VacAttrStats **stats, int natts,
+							   TupleDesc desc, HeapTuple *rows,
+							   int numrows);
 static void compute_index_stats(Relation onerel, double totalrows,
 								AnlIndexData *indexdata, int nindexes,
 								HeapTuple *rows, int numrows,
@@ -560,6 +563,19 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 			MemoryContextResetAndDeleteChildren(col_context);
 		}
 
+		/* compute disksize ratio stats if any */
+		if (AnalyzeSampleIsValid(sample_context, ANALYZE_SAMPLE_DISKSIZE))
+		{
+			TupleTableSlot *slot =
+				AnalyzeGetSampleSlot(sample_context, onerel, ANALYZE_SAMPLE_DISKSIZE);
+			HeapTuple *rows =
+				AnalyzeGetSampleRows(sample_context, ANALYZE_SAMPLE_DISKSIZE, 0);
+
+			compute_disk_stats(vacattrstats, attr_cnt,
+							   slot->tts_tupleDescriptor,
+							   rows, numrows);
+		}
+
 		if (hasindex)
 			compute_index_stats(onerel, totalrows,
 								indexdata, nindexes,
@@ -705,6 +721,41 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 	anl_context = NULL;
 }
 
+static void
+compute_disk_stats(VacAttrStats **stats, int natts,
+				   TupleDesc desc, HeapTuple *rows,
+				   int numrows)
+{
+	int		i, j;
+	float8	attr_size = 0;
+	float8	total = 0;
+	bool	isNull;
+
+	for (i = 0; i < numrows; i++)
+	{
+		HeapTuple tup = rows[i];
+
+		for (j = 0; j < natts; j++)
+		{
+			VacAttrStats *vac = stats[j];
+			Datum dat = heap_getattr(tup, j + 1, desc, &isNull);
+
+			if (!isNull)
+			{
+				attr_size = DatumGetFloat8(dat);
+				vac->disksize += attr_size;
+				total += attr_size;
+			}
+		}
+	}
+
+	for (j = 0; j < natts; j++)
+	{
+		VacAttrStats *vac = stats[j];
+		vac->stadiskfrac = vac->disksize / total;
+	}
+}
+
 /*
  * Compute statistics about indexes of a relation
  */
@@ -1394,6 +1445,7 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats)
 		values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(stats->attr->attnum);
 		values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(inh);
 		values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac);
+		values[Anum_pg_statistic_stadiskfrac - 1] = Float4GetDatum(stats->stadiskfrac);
 		values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth);
 		values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct);
 		i = Anum_pg_statistic_stakind1 - 1;
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index db3a68a..debb116 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -23,6 +23,7 @@
 #include "catalog/pg_class.h"
 #include "catalog/pg_operator.h"
 #include "catalog/pg_proc.h"
+#include "catalog/pg_statistic.h"
 #include "foreign/fdwapi.h"
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
@@ -47,6 +48,7 @@
 #include "partitioning/partbounds.h"
 #include "partitioning/partprune.h"
 #include "rewrite/rewriteManip.h"
+#include "utils/syscache.h"
 #include "utils/lsyscache.h"
 
 
@@ -80,6 +82,9 @@ static void set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
 							 Index rti, RangeTblEntry *rte);
 static void set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel,
 							   RangeTblEntry *rte);
+static void set_plain_rel_page_estimates(PlannerInfo *root,
+										 RelOptInfo *rel,
+										 RangeTblEntry *rte);
 static void create_plain_partial_paths(PlannerInfo *root, RelOptInfo *rel);
 static void set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel,
 									  RangeTblEntry *rte);
@@ -581,6 +586,46 @@ set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
 
 	/* Mark rel with estimated output rows, width, etc */
 	set_baserel_size_estimates(root, rel);
+
+	/* Estimate the pages based on the selected columns */
+	set_plain_rel_page_estimates(root, rel, rte);
+}
+
+static void
+set_plain_rel_page_estimates(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
+{
+	double		pages;
+	HeapTuple	tp;
+	AttrNumber	attno;
+	Selectivity sel = 0;
+
+	if (!rte->scanCols)
+		return;
+
+	attno = -1;
+	while ((attno = bms_next_member(rte->scanCols, attno)) >= 0)
+	{
+		tp = SearchSysCache3(STATRELATTINH,
+							 ObjectIdGetDatum(rte->relid),
+							 Int16GetDatum(attno),
+							 BoolGetDatum(rte->inh));
+
+		if (HeapTupleIsValid(tp))
+		{
+			sel += ((Form_pg_statistic) GETSTRUCT(tp))->stadiskfrac;
+			ReleaseSysCache(tp);
+		}
+	}
+
+	if (sel > 0)
+	{
+		pages = rel->pages * sel;
+
+		if (pages <= 1.0)
+			rel->pages = 1;
+		else
+			rel->pages = rint(pages);
+	}
 }
 
 /*
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 304d136..12d2494 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	201912061
+#define CATALOG_VERSION_NO	202002141
 
 #endif
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h
index 207be54..66029f6 100644
--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -36,6 +36,9 @@ CATALOG(pg_statistic,2619,StatisticRelationId)
 	/* the fraction of the column's entries that are NULL: */
 	float4		stanullfrac;
 
+	/* the fraction of the column's disksize of all columns */
+	float4		stadiskfrac;
+
 	/*
 	 * stawidth is the average width in bytes of non-null entries.  For
 	 * fixed-width datatypes this is of course the same as the typlen, but for
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 128f7ae..077a3c1 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -114,6 +114,12 @@ typedef struct VacAttrStats
 	Datum	   *stavalues[STATISTIC_NUM_SLOTS];
 
 	/*
+	 * These fields are to be filled in compute_disk_stats
+	 */
+	float4		stadiskfrac;	/* fraction of the physical size */
+	float8		disksize;		/* value of the physical size */
+
+	/*
 	 * These fields describe the stavalues[n] element types. They will be
 	 * initialized to match attrtypid, but a custom typanalyze function might
 	 * want to store an array of something other than the analyzed column's
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index ff626cb..ddb0b7d 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -1100,6 +1100,7 @@ typedef struct RangeTblEntry
 	Bitmapset  *updatedCols;	/* columns needing UPDATE permission */
 	Bitmapset  *extraUpdatedCols;	/* generated columns being updated */
 	List	   *securityQuals;	/* security barrier quals to apply, if any */
+	Bitmapset *scanCols;
 } RangeTblEntry;
 
 /*
-- 
1.8.3.1

From 471b1ba4bb704aac3d6128263ac2dbab103c13e8 Mon Sep 17 00:00:00 2001
From: Pengzhou Tang <ptang@pivotal.io>
Date: Wed, 20 Nov 2019 06:59:22 -0500
Subject: [PATCH 3/3] ZedStore uses extended ANALYZE API

1) use the logical block ID in ANALYZE
2) provide disk-size info per column during ANALYZE, so the
   planner can estimate the number of pages to scan based
   on the selected columns.
3) analyze only the specified columns
---
 src/backend/access/zedstore/zedstore_attstream.c |   7 +-
 src/backend/access/zedstore/zedstoream_handler.c | 118 ++++++++++++++++++++---
 src/include/access/zedstore_internal.h           |   4 +
 3 files changed, 115 insertions(+), 14 deletions(-)

diff --git a/src/backend/access/zedstore/zedstore_attstream.c b/src/backend/access/zedstore/zedstore_attstream.c
index b659c95..7a1a1a9 100644
--- a/src/backend/access/zedstore/zedstore_attstream.c
+++ b/src/backend/access/zedstore/zedstore_attstream.c
@@ -167,6 +167,7 @@ decode_attstream_begin(attstream_decoder *decoder, ZSAttStream *attstream)
 					  attstream->t_size - SizeOfZSAttStreamHeader,
 					  attstream->t_decompressed_bufsize);
 		decoder->chunks_len = attstream->t_decompressed_size;
+		decoder->compression_ratio = ((float8) buf_size_needed) / attstream->t_size;
 	}
 	else
 	{
@@ -174,6 +175,7 @@ decode_attstream_begin(attstream_decoder *decoder, ZSAttStream *attstream)
 			   ((char *) attstream) + SizeOfZSAttStreamHeader,
 			   attstream->t_size - SizeOfZSAttStreamHeader);
 		decoder->chunks_len = attstream->t_size - SizeOfZSAttStreamHeader;
+		decoder->compression_ratio = 1.0;
 	}
 	decoder->firsttid = get_chunk_first_tid(decoder->attlen, decoder->chunks_buf);
 	decoder->lasttid = attstream->t_lasttid;
@@ -182,6 +184,7 @@ decode_attstream_begin(attstream_decoder *decoder, ZSAttStream *attstream)
 	decoder->prevtid = 0;
 
 	decoder->num_elements = 0;
+	decoder->avg_elements_size = 0;
 }
 
 /*
@@ -227,6 +230,7 @@ decode_attstream_cont(attstream_decoder *decoder)
 	zstid		lasttid;
 	int			total_decoded;
 	char	   *p;
+	char	   *lastp;
 	char	   *pend;
 	MemoryContext oldcxt;
 
@@ -237,7 +241,7 @@ decode_attstream_cont(attstream_decoder *decoder)
 		MemoryContextSwitchTo(decoder->tmpcxt);
 	}
 
-	p = decoder->chunks_buf + decoder->pos;
+	lastp = p = decoder->chunks_buf + decoder->pos;
 	pend = decoder->chunks_buf + decoder->chunks_len;
 
 	total_decoded = 0;
@@ -262,6 +266,7 @@ decode_attstream_cont(attstream_decoder *decoder)
 
 	Assert(p <= pend);
 	decoder->num_elements = total_decoded;
+	decoder->avg_elements_size = ((p - lastp) / total_decoded) / decoder->compression_ratio;
 	decoder->pos = p - decoder->chunks_buf;
 	if (total_decoded > 0)
 	{
diff --git a/src/backend/access/zedstore/zedstoream_handler.c b/src/backend/access/zedstore/zedstoream_handler.c
index 0b59191..e844a31 100644
--- a/src/backend/access/zedstore/zedstoream_handler.c
+++ b/src/backend/access/zedstore/zedstoream_handler.c
@@ -35,6 +35,7 @@
 #include "miscadmin.h"
 #include "optimizer/plancat.h"
 #include "pgstat.h"
+#include "parser/parse_relation.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
 #include "storage/procarray.h"
@@ -2420,34 +2421,110 @@ zedstoream_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 	zsbt_tuplebuffer_flush(NewHeap);
 }
 
+static void 
+zedstoream_scan_analyze_beginscan(Relation onerel, AnalyzeSampleContext *context)
+{
+	zstid 	tid;
+	List	*va_cols = context->anl_cols;		
+	Bitmapset	*project_columns = NULL;	
+
+	/* zedstore can sample rows on specified columns only */
+	if (!va_cols)
+		context->scan = table_beginscan_analyze(onerel);
+	else
+	{
+		ListCell	*le;
+
+		foreach(le, va_cols)
+		{
+			char	   *col = strVal(lfirst(le));
+
+			project_columns =
+				bms_add_member(project_columns, attnameAttNum(onerel, col, false));
+		}
+
+		context->scan = 
+			zedstoream_beginscan_with_column_projection(onerel, NULL, 0, NULL,
+														NULL, SO_TYPE_ANALYZE,
+														project_columns);
+	}
+
+	/* zedstore uses a logical block number to acquire sample rows */
+	tid = zsbt_get_last_tid(onerel);
+	context->totalblocks = ZSTidGetBlockNumber(tid) + 1;
+}
+
 /*
- * FIXME: The ANALYZE API is problematic for us. acquire_sample_rows() calls
- * RelationGetNumberOfBlocks() directly on the relation, and chooses the
- * block numbers to sample based on that. But the logical block numbers
- * have little to do with physical ones in zedstore.
+ * Get next logical block.
  */
 static bool
-zedstoream_scan_analyze_next_block(TableScanDesc sscan, BlockNumber blockno,
-								   BufferAccessStrategy bstrategy)
+zedstoream_scan_analyze_next_block(BlockNumber blockno,
+								   AnalyzeSampleContext *context)
 {
-	return zs_blkscan_next_block(sscan, blockno, NULL, -1, false);
+	return zs_blkscan_next_block(context->scan, blockno, NULL, -1, false);
 }
 
 static bool
-zedstoream_scan_analyze_next_tuple(TableScanDesc sscan, TransactionId OldestXmin,
-								   double *liverows, double *deadrows,
-								   TupleTableSlot *slot)
+zedstoream_scan_analyze_next_tuple(TransactionId OldestXmin, AnalyzeSampleContext *context)
 {
-	bool		result;
+	int		i;
+	bool	result;
+	AttrNumber		attno;
+	TableScanDesc	scan = context->scan;
+	ZedStoreDesc	sscan = (ZedStoreDesc) scan;
+	ZSAttrTreeScan	*attr_scan;
+	TupleTableSlot	*slot = AnalyzeGetSampleSlot(context, scan->rs_rd, ANALYZE_SAMPLE_DATA);
 
-	result = zs_blkscan_next_tuple(sscan, slot);
+	result = zs_blkscan_next_tuple(scan, slot);
 
 	if (result)
-		(*liverows)++;
+	{
+		/* provide extra disk info when analyzing on full columns */
+		if (!context->anl_cols)
+		{
+			slot = AnalyzeGetSampleSlot(context, scan->rs_rd, ANALYZE_SAMPLE_DISKSIZE);
+
+			for (i = 1; i < sscan->proj_data.num_proj_atts; i++)
+			{
+				attr_scan = &sscan->proj_data.attr_scans[i - 1];	
+				attno = sscan->proj_data.proj_atts[i];
+
+				slot->tts_values[attno - 1] =
+					Float8GetDatum(attr_scan->decoder.avg_elements_size); 
+				slot->tts_isnull[attno - 1] = false;
+				slot->tts_flags &= ~TTS_FLAG_EMPTY;
+			}
+		}
+
+		context->liverows++;
+	}
 
 	return result;
 }
 
+static void
+zedstoream_scan_analyze_sample_tuple(int pos, bool replace, AnalyzeSampleContext *context)
+{
+	TupleTableSlot *slot;
+	Relation onerel = context->scan->rs_rd;
+
+	slot = AnalyzeGetSampleSlot(context, onerel, ANALYZE_SAMPLE_DATA);
+	AnalyzeRecordSampleRow(context, slot, NULL, ANALYZE_SAMPLE_DATA, pos, replace, false);
+
+	/* only record the disk-size sample when analyzing all columns */
+	if (!context->anl_cols)
+	{
+		slot = AnalyzeGetSampleSlot(context, onerel, ANALYZE_SAMPLE_DISKSIZE);
+		AnalyzeRecordSampleRow(context, slot, NULL, ANALYZE_SAMPLE_DISKSIZE, pos, replace, false);
+	}
+}
+
+static void
+zedstoream_scan_analyze_endscan(AnalyzeSampleContext *context)
+{
+	table_endscan(context->scan);
+}
+
 /* ------------------------------------------------------------------------
  * Miscellaneous callbacks for the heap AM
  * ------------------------------------------------------------------------
@@ -2713,6 +2790,18 @@ zs_blkscan_next_tuple(TableScanDesc sscan, TupleTableSlot *slot)
 
 	if (scan->bmscan_nexttuple >= scan->bmscan_ntuples)
 		return false;
+
+	/*
+	 * Initialize the slot.
+	 *
+	 * We initialize all columns to NULL. The values for columns that are projected
+	 * will be set to the actual values below, but it's important that non-projected
+	 * columns are NULL.
+	 */
+	ExecClearTuple(slot);
+	for (int i = 0; i < sscan->rs_rd->rd_att->natts; i++)
+		slot->tts_isnull[i] = true;
+
 	/*
 	 * projection attributes were created based on Relation tuple descriptor
 	 * it better match TupleTableSlot.
@@ -2935,8 +3024,11 @@ static const TableAmRoutine zedstoream_methods = {
 	.relation_copy_data = zedstoream_relation_copy_data,
 	.relation_copy_for_cluster = zedstoream_relation_copy_for_cluster,
 	.relation_vacuum = zedstoream_vacuum_rel,
+	.scan_analyze_beginscan = zedstoream_scan_analyze_beginscan,
 	.scan_analyze_next_block = zedstoream_scan_analyze_next_block,
 	.scan_analyze_next_tuple = zedstoream_scan_analyze_next_tuple,
+	.scan_analyze_sample_tuple = zedstoream_scan_analyze_sample_tuple,
+	.scan_analyze_endscan = zedstoream_scan_analyze_endscan,
 
 	.index_build_range_scan = zedstoream_index_build_range_scan,
 	.index_validate_scan = zedstoream_index_validate_scan,
diff --git a/src/include/access/zedstore_internal.h b/src/include/access/zedstore_internal.h
index 21ea504..58227bd 100644
--- a/src/include/access/zedstore_internal.h
+++ b/src/include/access/zedstore_internal.h
@@ -78,6 +78,9 @@ typedef struct
 	char	   *chunks_buf;
 	int			chunks_buf_size;
 
+	/* attstream compression ratio */
+	float8		compression_ratio;
+
 	/* information about the current attstream in the buffer */
 	int			chunks_len;
 	zstid		firsttid;
@@ -96,6 +99,7 @@ typedef struct
 	Datum		datums[DECODER_MAX_ELEMS];
 	bool		isnulls[DECODER_MAX_ELEMS];
 	int			num_elements;
+	float8		avg_elements_size; /* avg physical size of elements */
 } attstream_decoder;
 
 /*
-- 
1.8.3.1

Reply via email to