Github user Librago commented on a diff in the pull request:
https://github.com/apache/incubator-hawq/pull/1384#discussion_r208448574
--- Diff: src/backend/commands/analyze.c ---
@@ -3266,3 +3298,380 @@ static void
gp_statistics_estimate_reltuples_relpages_parquet(Relation rel, floa
pfree(fstotal);
return;
}
+
+/**
+ * This method estimates the number of tuples and pages in an extern
relation. We can not get accurate tuple counts
+ * and pages counts in the catalog. Therefore, we have to get reltuples
and relpages manually.
+ *
+ * Input:
+ * rel - Relation. Must be an external table.
+ *
+ * Output:
+ * reltuples - exact number of tuples in relation.
+ * relpages - exact number of pages.
+ */
+static void gp_statistics_estimate_reltuples_relpages_external(Relation
rel, float4 *relTuples, float4 *relPages){
+ Oid extRelationOid = RelationGetRelid(rel);
+ getExternalRelTuples(extRelationOid, relTuples);
+ getExternalRelPages(extRelationOid, relPages, rel);
+}
+
+/**
+ * This method called by analyzeExternalEstimateReltuplesRelpages,
+ * to get External Relation reltuple counts, we run count(*) sql manually
+ *
+ * Input:
+ * extRelationOid - External Table Relation Oid
+ * Output:
+ * relTuples - exact number of tuples in relation.
+ */
+static void getExternalRelTuples(Oid extRelationOid, float4 *relTuples){
+ const char *schemaName = NULL;
+ const char *tableName = NULL;
+ schemaName = get_namespace_name(get_rel_namespace(extRelationOid)); /*
must be pfreed */
+ tableName = get_rel_name(extRelationOid); /* must be pfreed */
+
+ StringInfoData str;
+ initStringInfo(&str);
+ appendStringInfo(&str, "select count(*)::float4 from %s.%s as Ta",
+ quote_identifier(schemaName),
+ quote_identifier(tableName));
+
+ spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */,
+
spiCallback_getSingleResultRowColumnAsFloat4, relTuples);
+ pfree((void *) tableName);
+ pfree((void *) schemaName);
+ pfree(str.data);
+}
+
+/**
+ * This method called by analyzeExternalEstimateReltuplesRelpages,to get
External Relation relpages counts.
+ * We call GetExtTableEntry method to get get List of external Table
Locations.And then we go through every
+ * location url to sum the count of relpages.External Relation now support
some different protocals, therefore
+ * we need to process them in different way.
+ *
+ * Input:
+ * extRelationOid - External Table Relation Oid
+ * Output:
+ * relTuples - exact number of pages in relation.
+ */
+static void getExternalRelPages(Oid extRelationOid, float4 *relPages ,
Relation rel){
+
+ ExtTableEntry* entry = GetExtTableEntry(extRelationOid);
+ List* extLocations = entry->locations;
+ int num_urls = list_length(extLocations);
+ ListCell *cell = list_head(extLocations);
+ ListCell *cellTmp = NULL;
+ while(cell != NULL)
+ {
+ char *url = pstrdup(((Value*)lfirst(cell))->val.str);
+ Assert(url != NULL);
+ Uri *uri = ParseExternalTableUri(url);
+ Assert(uri != NULL);
+ switch (uri->protocol){
+ case URI_HDFS:
+ *relPages += getExtrelPagesHDFS(uri);
+ break;
+
+ /*
+ * to be done
+ */
+ case URI_GPFDIST:
+ *relPages = 1.0;
+ elog(NOTICE,"In external table ANALYZE command are not
supported in GPFDIST location so far.");
+ break;
+ case URI_FILE:
+ *relPages = 1.0;
+ elog(NOTICE,"In external table ANALYZE command are not
supported in FILE location so far.");
+ break;
+ case URI_FTP:
+ *relPages = 1.0;
+ elog(NOTICE,"In external table ANALYZE command
are not supported in FTP location so far.");
+ break;
+ case URI_HTTP:
+ *relPages = 1.0;
+ elog(NOTICE,"In external table ANALYZE command
are not supported in HTTP location so far.");
+ break;
+ case URI_CUSTOM:
+ *relPages = 1.0;
+ elog(NOTICE,"In external table ANALYZE command
are not supported in CUSTOM location so far.");
+ break;
+ case URI_GPFDISTS:
+ *relPages = 1.0;
+ elog(NOTICE,"In external table ANALYZE command
are not supported in GPFDISTS location so far.");
+ break;
+ default:
+ *relPages = 1.0;
+ elog(NOTICE,"should not go here");
+ break;
+ }
+
+ cell = cell->next;
+
+ /* free resourse */
+ pfree(url);
+ if(uri->customprotocol != NULL){ pfree(uri->customprotocol);}
+ pfree(uri->hostname);
+ if(uri->path!=NULL){pfree(uri->path);}
+ pfree(uri);
+ }
+ /* pfree entry->location*/
+ list_free_deep(extLocations);
+ /* pfree entry */
+ if(entry->fmtopts != NULL){ pfree(entry->fmtopts);}
+ if(entry->command != NULL){ pfree(entry->command);}
+ pfree(entry);
+}
+
+/**
+ * This method get the number of pages external table which location uri
protocol is HDFS. We hold that
+ * the concept of the page number in external table is same as the
concept of block number in hdfs.
+ * Therefore we get the number of pages for external table by get the
number of blocks in hdfs
+ *
+ * Input:
+ * uri - hdfs uri which refers to external table storage location,
uri can refer to a file or a folder
+ *
+ */
+static float4 getExtrelPagesHDFS(Uri *uri){
+ int numOfBlock = 0;
+ int nsize = 0;
+ float4 relpages = 0.0;
+ hdfsFS fs = hdfsConnect(uri->hostname, uri->port);
+
+ //hdfsFileInfo *fiarray = hdfsGetPathInfo(fs, uri->path);
+ hdfsFileInfo *fiarray = hdfsListDirectory(fs, uri->path,&nsize);
+ if (fs == NULL)
+ {
+ elog(ERROR, "hdfsprotocol_blocklocation : "
+ "failed to get files of path %s",
+ uri->path);
+ }
+
+ /* Call block location api to get data location for each file */
+ for (int i = 0 ; i < nsize ; i++)
+ {
+// FscHdfsFileInfoC *fi = FscHdfsGetFileInfoFromArray(fiarray, i);
+ hdfsFileInfo *fi = &fiarray[i];
+ /* break condition of this for loop */
+ if (fi == NULL) {break;}
+
+ /* Build file name full path. */
+ const char *fname = fi->mName;
+ char *fullpath = palloc0(
+ strlen(fname)
+ /* name */
+ 1);
/* \0 */
+ sprintf(fullpath, "%s/%s", uri->path, fname);
+
+ /* Get file full length. */
+ // int64_t len = FscHdfsGetFileInfoLength(fi);
+ int64_t len = fi->mSize;
+ if (len == 0) {
+ pfree(fullpath);
+ continue;
+ }
+
+ /* Get block location data for this file */
+ BlockLocation *bla = hdfsGetFileBlockLocations(fs, fullpath, 0,
len,&numOfBlock);
+ if (bla == NULL)
+ {
+ elog(ERROR, "hdfsprotocol_blocklocation : "
+ "failed to get block location
of path %s. "
+ "It is reported generally due
to HDFS service errors or "
+ "another session's ongoing
writing.",
+ fullpath);
+ }
+
+ relpages += numOfBlock;
+
+
+ /* We don't need it any longer */
+ pfree(fullpath);
+
+ /* Clean up block location instances created by the lib. */
+ hdfsFreeFileBlockLocations(&bla,numOfBlock);
+ }
+
+ /* Clean up file info array created by the lib for this location. */
+// FscHdfsFreeFileInfoArrayC(&fiarray);
+ hdfsFreeFileInfo(fiarray,nsize);
+ hdfsDisconnect(fs);
+ return relpages;
+}
+
+
+/*
+ * Get total bytes of external table with HDFS protocol
+ */
+uint64 GetExternalTotalBytesHDFS(Uri *uri)
+{
+ uint64 totalBytes = 0;
+ int nsize = 0;
+
+ hdfsFS fs = hdfsConnect(uri->hostname, uri->port);
+
+ hdfsFileInfo *fiarray = hdfsListDirectory(fs, uri->path,&nsize);
+ if (fiarray == NULL)
+ {
+ elog(ERROR, "hdfsprotocol_blocklocation : "
+ "failed to get files of path %s.",
+ uri->path);
+ }
+
+ /* Call block location api to get data location for each file */
+ for (int i = 0 ; i < nsize ; i++)
+ {
+ hdfsFileInfo *fi = &fiarray[i];
+
+ /* Break condition of this for loop */
+ if (fi == NULL)
+ {
+ break;
+ }
+
+ /* Get file full length. */
+ totalBytes += fi->mSize;
+
+ }
+
+ /* Clean up file info array created by the lib for this location. */
+ hdfsFreeFileInfo(fiarray,nsize);
+ hdfsDisconnect(fs);
+
+ return totalBytes;
+}
+
+/*
+ * Get total bytes of external table
+ */
+uint64 GetExternalTotalBytes(Relation rel)
+{
+ Oid extRelOid = RelationGetRelid(rel);
+ ExtTableEntry *entry = GetExtTableEntry(extRelOid);
+ List *extLocations = entry->locations;
+ int num_urls = list_length(extLocations);
+ ListCell *cell = list_head(extLocations);
+ ListCell *cellTmp = NULL;
+ uint64 totalBytes = 0;
+
+ while(cell != NULL)
+ {
+ char *url = pstrdup(((Value*)lfirst(cell))->val.str);
+ Assert(url != NULL);
+
+ Uri *uri = ParseExternalTableUri(url);
+ Assert(uri != NULL);
+
+ switch (uri->protocol)
+ {
+ case URI_HDFS:
+ totalBytes += GetExternalTotalBytesHDFS(uri);
+ break;
+ /*
+ * Support analyze for external table.
+ * For now, HDFS protocol external table is supported.
+ */
+ case URI_GPFDIST:
+ totalBytes += 0;
+ elog(ERROR,"In external table ANALYZE command are not
supported in GPFDIST location so far.");
+ break;
+
+ case URI_FILE:
+ totalBytes += 0;
+ elog(ERROR,"In external table ANALYZE command are not
supported in FILE location so far.");
+ break;
+
+ case URI_FTP:
+ totalBytes += 0;
+ elog(ERROR,"In external table ANALYZE command are not
supported in FTP location so far.");
+ break;
+
+ case URI_HTTP:
+ totalBytes += 0;
+ elog(ERROR,"In external table ANALYZE command are not
supported in HTTP location so far.");
+ break;
+
+ case URI_CUSTOM:
+ totalBytes += 0;
+ elog(ERROR,"In external table ANALYZE command are not
supported in CUSTOM location so far.");
+ break;
+
+ case URI_GPFDISTS:
+ totalBytes += 0;
+ elog(ERROR,"In external table ANALYZE command are not
supported in GPFDISTS location so far.");
+ break;
+
+ default:
+ totalBytes += 0;
+ elog(ERROR,"should not go here");
+ break;
+ }
+
+ cell = cell->next;
+
+ /* free resourse */
+ pfree(url);
+ if (uri->customprotocol != NULL)
+ {
+ pfree(uri->customprotocol);
+ }
+ pfree(uri->hostname);
+
+ if (uri->path != NULL)
+ {
+ pfree(uri->path);
+ }
+ pfree(uri);
+
+ }
+ /* pfree entry->location*/
+ list_free_deep(extLocations);
+ /* pfree entry */
+ if (entry->fmtopts != NULL)
+ {
+ pfree(entry->fmtopts);
+ }
+ if (entry->command != NULL)
+ {
+ pfree(entry->command);
+ }
+ pfree(entry);
+
+ return totalBytes;
+}
+
+/*
+ * Check if a relation is external table with HDFS protocol
+ */
+static bool isExternalHDFSORMAGMAProtocol(Oid relOid)
+{
+ bool ret = true;
+
+ Relation rel = try_relation_open(relOid, AccessShareLock, false);
+ if (rel != NULL)
+ {
+ if ((rel->rd_rel->relkind == RELKIND_RELATION) &&
+ RelationIsExternal(rel))
+ {
+ ExtTableEntry* entry = GetExtTableEntry(relOid);
+ List* extLocations = entry->locations;
+ ListCell *cell = list_head(extLocations);
+ while(cell != NULL)
+ {
+ char *url = ((Value*)lfirst(cell))->val.str;
+ Assert(url != NULL);
+ // if (!IS_HDFS_URI(url))
+ if (!IS_HDFS_URI(url) && !IS_MAGMA_URI(url))
--- End diff --
Please remove the MAGMA branch (`IS_MAGMA_URI`) from this check — this patch should only recognize HDFS external tables.
---