Github user Librago commented on a diff in the pull request:

    https://github.com/apache/incubator-hawq/pull/1384#discussion_r208448574
  
    --- Diff: src/backend/commands/analyze.c ---
    @@ -3266,3 +3298,380 @@ static void 
gp_statistics_estimate_reltuples_relpages_parquet(Relation rel, floa
        pfree(fstotal);
        return;
     }
    +
    +/**
    + * This method estimates the number of tuples and pages in an external 
relation. We cannot get accurate tuple counts
    + * and page counts from the catalog. Therefore, we have to compute reltuples 
and relpages manually.
    + *
    + * Input:
    + *         rel - Relation. Must be an external table.
    + *
    + * Output:
    + *         reltuples - exact number of tuples in relation.
    + *         relpages  - exact number of pages.
    + */
    +static void gp_statistics_estimate_reltuples_relpages_external(Relation 
rel, float4 *relTuples, float4 *relPages){
    +   Oid extRelationOid = RelationGetRelid(rel);
    +   getExternalRelTuples(extRelationOid, relTuples);
    +   getExternalRelPages(extRelationOid, relPages, rel);
    +}
    +
    +/**
    + * This method is called by analyzeExternalEstimateReltuplesRelpages
    + * to get the tuple count of an external relation; it runs a count(*) SQL query manually.
    + *
    + * Input:
    + *         extRelationOid - External Table Relation Oid
    + * Output:
    + *         relTuples - exact number of tuples in relation.
    + */
    +static void getExternalRelTuples(Oid extRelationOid, float4 *relTuples){
    +   const char *schemaName = NULL;
    +   const char *tableName = NULL;
    +   schemaName = get_namespace_name(get_rel_namespace(extRelationOid)); /* 
must be pfreed */
    +   tableName = get_rel_name(extRelationOid); /* must be pfreed */
    +
    +   StringInfoData str;
    +   initStringInfo(&str);
    +   appendStringInfo(&str, "select count(*)::float4 from %s.%s as Ta",
    +                   quote_identifier(schemaName),
    +                   quote_identifier(tableName));
    +
    +   spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */,
    +                                                           
spiCallback_getSingleResultRowColumnAsFloat4, relTuples);
    +   pfree((void *) tableName);
    +   pfree((void *) schemaName);
    +   pfree(str.data);
    +}
    +
    +/**
    + * This method is called by analyzeExternalEstimateReltuplesRelpages to get 
the relpages count of an external relation.
    + * We call GetExtTableEntry to get the list of external table 
locations, and then we go through every
    + * location URL to sum the relpages count. External relations now support 
several different protocols; therefore,
    + * we need to process them in different ways.
    + *
    + * Input:
    + *         extRelationOid - External Table Relation Oid
    + * Output:
    + *         relPages - exact number of pages in relation.
    + */
    +static void getExternalRelPages(Oid extRelationOid, float4 *relPages , 
Relation rel){
    +
    +   ExtTableEntry* entry = GetExtTableEntry(extRelationOid);
    +   List* extLocations = entry->locations;
    +   int     num_urls = list_length(extLocations);
    +   ListCell *cell = list_head(extLocations);
    +   ListCell *cellTmp = NULL;
    +   while(cell != NULL)
    +   {
    +           char *url = pstrdup(((Value*)lfirst(cell))->val.str);
    +           Assert(url != NULL);
    +           Uri *uri = ParseExternalTableUri(url);
    +           Assert(uri != NULL);
    +           switch (uri->protocol){
    +                   case URI_HDFS:
    +                           *relPages += getExtrelPagesHDFS(uri);
    +                           break;
    +
    +                   /*
    +                    * to be done
    +                    */
    +                   case URI_GPFDIST:
    +                *relPages = 1.0;
    +                elog(NOTICE,"In external table ANALYZE command are not 
supported in GPFDIST location so far.");
    +                break;
    +            case URI_FILE:
    +                *relPages = 1.0;
    +                elog(NOTICE,"In external table ANALYZE command are not 
supported in FILE location so far.");
    +                break;
    +                   case URI_FTP:
    +                           *relPages = 1.0;
    +                           elog(NOTICE,"In external table ANALYZE command 
are not supported in FTP location so far.");
    +                break;
    +                   case URI_HTTP:
    +                           *relPages = 1.0;
    +                           elog(NOTICE,"In external table ANALYZE command 
are not supported in HTTP location so far.");
    +                break;
    +                   case URI_CUSTOM:
    +                           *relPages = 1.0;
    +                           elog(NOTICE,"In external table ANALYZE command 
are not supported in CUSTOM location so far.");
    +                break;
    +                   case URI_GPFDISTS:
    +                           *relPages = 1.0;
    +                           elog(NOTICE,"In external table ANALYZE command 
are not supported in GPFDISTS location so far.");
    +                break;
    +                   default:
    +                           *relPages = 1.0;
    +                elog(NOTICE,"should not go here");
    +                           break;
    +           }
    +
    +           cell = cell->next;
    +
    +           /* free resourse */
    +           pfree(url);
    +           if(uri->customprotocol != NULL){ pfree(uri->customprotocol);}
    +           pfree(uri->hostname);
    +           if(uri->path!=NULL){pfree(uri->path);}
    +           pfree(uri);
    +   }
    +   /* pfree entry->location*/
    +   list_free_deep(extLocations);
    +   /* pfree entry */
    +   if(entry->fmtopts != NULL){ pfree(entry->fmtopts);}
    +   if(entry->command != NULL){ pfree(entry->command);}
    +   pfree(entry);
    +}
    +
    +/**
    + * This method gets the number of pages of an external table whose location URI 
protocol is HDFS. We treat
    + * a page of an external table as the same concept as a 
block in HDFS.
    + * Therefore, we get the number of pages of the external table by getting the 
number of blocks in HDFS.
    + *
    + * Input:
    + *         uri - hdfs uri which refers to external table storage location, 
uri can refer to a file or a folder
    + *
    + */
    +static float4 getExtrelPagesHDFS(Uri *uri){
    +   int numOfBlock = 0;
    +   int nsize = 0;
    +   float4 relpages = 0.0;
    +   hdfsFS fs = hdfsConnect(uri->hostname, uri->port);
    +
    +   //hdfsFileInfo *fiarray = hdfsGetPathInfo(fs, uri->path);
    +   hdfsFileInfo *fiarray = hdfsListDirectory(fs, uri->path,&nsize);
    +   if (fs == NULL)
    +   {
    +           elog(ERROR, "hdfsprotocol_blocklocation : "
    +                                   "failed to get files of path %s",
    +                                   uri->path);
    +   }
    +
    +   /* Call block location api to get data location for each file */
    +   for (int i = 0 ; i < nsize ; i++)
    +   {
    +//         FscHdfsFileInfoC *fi = FscHdfsGetFileInfoFromArray(fiarray, i);
    +           hdfsFileInfo *fi = &fiarray[i];
    +           /* break condition of this for loop */
    +           if (fi == NULL) {break;}
    +
    +           /* Build file name full path. */
    +           const char *fname = fi->mName;
    +           char *fullpath = palloc0(
    +                                                            strlen(fname) 
+      /* name  */
    +                                                            1);            
      /* \0    */
    +           sprintf(fullpath, "%s/%s", uri->path, fname);
    +
    +           /* Get file full length. */
    +   //      int64_t len = FscHdfsGetFileInfoLength(fi);
    +           int64_t len = fi->mSize;
    +           if (len == 0) {
    +                   pfree(fullpath);
    +                   continue;
    +           }
    +
    +           /* Get block location data for this file */
    +           BlockLocation *bla = hdfsGetFileBlockLocations(fs, fullpath, 0, 
len,&numOfBlock);
    +           if (bla == NULL)
    +           {
    +                   elog(ERROR, "hdfsprotocol_blocklocation : "
    +                                           "failed to get block location 
of path %s. "
    +                                           "It is reported generally due 
to HDFS service errors or "
    +                                           "another session's ongoing 
writing.",
    +                                           fullpath);
    +           }
    +
    +           relpages += numOfBlock;
    +
    +
    +           /* We don't need it any longer */
    +           pfree(fullpath);
    +
    +           /* Clean up block location instances created by the lib. */
    +           hdfsFreeFileBlockLocations(&bla,numOfBlock);
    +   }
    +
    +   /* Clean up file info array created by the lib for this location. */
    +// FscHdfsFreeFileInfoArrayC(&fiarray);
    +   hdfsFreeFileInfo(fiarray,nsize);
    +   hdfsDisconnect(fs);
    +   return relpages;
    +}
    +
    +
    +/*
    + * Get total bytes of external table with HDFS protocol
    + */
    +uint64 GetExternalTotalBytesHDFS(Uri *uri)
    +{
    +   uint64 totalBytes = 0;
    +   int nsize = 0;
    +
    +   hdfsFS fs = hdfsConnect(uri->hostname, uri->port);
    +
    +   hdfsFileInfo *fiarray = hdfsListDirectory(fs, uri->path,&nsize);
    +   if (fiarray == NULL)
    +   {
    +           elog(ERROR, "hdfsprotocol_blocklocation : "
    +                       "failed to get files of path %s.",
    +                       uri->path);
    +   }
    +
    +   /* Call block location api to get data location for each file */
    +   for (int i = 0 ; i < nsize ; i++)
    +   {
    +           hdfsFileInfo *fi = &fiarray[i];
    +
    +           /* Break condition of this for loop */
    +           if (fi == NULL)
    +           {
    +                   break;
    +           }
    +
    +           /* Get file full length. */
    +           totalBytes += fi->mSize;
    +
    +   }
    +
    +   /* Clean up file info array created by the lib for this location. */
    +   hdfsFreeFileInfo(fiarray,nsize);
    +           hdfsDisconnect(fs);
    +
    +   return totalBytes;
    +}
    +
    +/*
    + * Get total bytes of external table
    + */
    +uint64 GetExternalTotalBytes(Relation rel)
    +{
    +   Oid extRelOid = RelationGetRelid(rel);
    +   ExtTableEntry *entry = GetExtTableEntry(extRelOid);
    +   List *extLocations = entry->locations;
    +   int num_urls = list_length(extLocations);
    +   ListCell *cell = list_head(extLocations);
    +   ListCell *cellTmp = NULL;
    +   uint64 totalBytes = 0;
    +
    +   while(cell != NULL)
    +   {
    +           char *url = pstrdup(((Value*)lfirst(cell))->val.str);
    +           Assert(url != NULL);
    +
    +           Uri *uri = ParseExternalTableUri(url);
    +           Assert(uri != NULL);
    +
    +           switch (uri->protocol)
    +           {
    +           case URI_HDFS:
    +                   totalBytes += GetExternalTotalBytesHDFS(uri);
    +                   break;
    +           /*
    +            * Support analyze for external table.
    +            * For now, HDFS protocol external table is supported.
    +            */
    +           case URI_GPFDIST:
    +                   totalBytes += 0;
    +                   elog(ERROR,"In external table ANALYZE command are not 
supported in GPFDIST location so far.");
    +                   break;
    +
    +           case URI_FILE:
    +                   totalBytes += 0;
    +                   elog(ERROR,"In external table ANALYZE command are not 
supported in FILE location so far.");
    +                   break;
    +
    +           case URI_FTP:
    +                   totalBytes += 0;
    +                   elog(ERROR,"In external table ANALYZE command are not 
supported in FTP location so far.");
    +                   break;
    +
    +           case URI_HTTP:
    +                   totalBytes += 0;
    +                   elog(ERROR,"In external table ANALYZE command are not 
supported in HTTP location so far.");
    +                   break;
    +
    +           case URI_CUSTOM:
    +                   totalBytes += 0;
    +                   elog(ERROR,"In external table ANALYZE command are not 
supported in CUSTOM location so far.");
    +                   break;
    +
    +           case URI_GPFDISTS:
    +                   totalBytes += 0;
    +                   elog(ERROR,"In external table ANALYZE command are not 
supported in GPFDISTS location so far.");
    +                   break;
    +
    +           default:
    +                   totalBytes += 0;
    +                   elog(ERROR,"should not go here");
    +                   break;
    +           }
    +
    +           cell = cell->next;
    +
    +           /* free resourse */
    +           pfree(url);
    +           if (uri->customprotocol != NULL)
    +           {
    +                   pfree(uri->customprotocol);
    +           }
    +           pfree(uri->hostname);
    +
    +           if (uri->path != NULL)
    +           {
    +                   pfree(uri->path);
    +           }
    +           pfree(uri);
    +
    +   }
    +   /* pfree entry->location*/
    +   list_free_deep(extLocations);
    +   /* pfree entry */
    +   if (entry->fmtopts != NULL)
    +   {
    +           pfree(entry->fmtopts);
    +   }
    +   if (entry->command != NULL)
    +   {
    +           pfree(entry->command);
    +   }
    +   pfree(entry);
    +
    +   return totalBytes;
    +}
    +
    +/*
    + * Check if a relation is external table with HDFS protocol
    + */
    +static bool isExternalHDFSORMAGMAProtocol(Oid relOid)
    +{
    +   bool ret = true;
    +
    +   Relation rel = try_relation_open(relOid, AccessShareLock, false);
    +   if (rel != NULL)
    +   {
    +           if ((rel->rd_rel->relkind == RELKIND_RELATION) &&
    +               RelationIsExternal(rel))
    +           {
    +                   ExtTableEntry* entry = GetExtTableEntry(relOid);
    +                   List* extLocations = entry->locations;
    +                   ListCell *cell = list_head(extLocations);
    +                   while(cell != NULL)
    +                   {
    +                           char *url = ((Value*)lfirst(cell))->val.str;
    +                           Assert(url != NULL);
    +                   //      if (!IS_HDFS_URI(url))
    +                           if (!IS_HDFS_URI(url) && !IS_MAGMA_URI(url))
    --- End diff --
    
    Please remove the MAGMA check (`IS_MAGMA_URI`) from this condition.


---

Reply via email to