[GitHub] [drill] vvysotskyi commented on a change in pull request #1839: DRILL-4517: Support reading empty Parquet files

GitBox Sun, 11 Aug 2019 07:46:30 -0700

vvysotskyi commented on a change in pull request #1839: DRILL-4517: Support reading empty Parquet files
URL: https://github.com/apache/drill/pull/1839#discussion_r312739651


 ##########
 File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ReadState.java
 ##########
 @@ -69,69 +69,59 @@
   private boolean useAsyncColReader;
 
   public ReadState(ParquetSchema schema,
 -    RecordBatchSizerManager batchSizerMgr, ParquetReaderStats parquetReaderStats, long numRecordsToRead,
-    boolean useAsyncColReader) {
-
+                   RecordBatchSizerManager batchSizerMgr,
+                   ParquetReaderStats parquetReaderStats,
+                   long numRecordsToRead,
+                   boolean useAsyncColReader) {
     this.schema = schema;
     this.batchSizerMgr = batchSizerMgr;
     this.parquetReaderStats = parquetReaderStats;
     this.useAsyncColReader = useAsyncColReader;
-    if (! schema.isStarQuery()) {
-      nullFilledVectors = new ArrayList<>();
-    }
-
 -    // In the case where runtime pruning prunes out all the rowgroups, then just a single rowgroup
 -    // with zero rows is read (in order to get the schema, no need for the rows)
-    if ( numRecordsToRead == 0 ) {
-      this.totalNumRecordsToRead = 0;
-      return;
-    }
-
 -    // Because of JIRA DRILL-6528, the Parquet reader is sometimes getting the wrong
-    // number of rows to read. For now, returning all a file data (till
-    // downstream operator stop consuming).
-    numRecordsToRead = -1;
-
-    // Callers can pass -1 if they want to read all rows.
 -    if (numRecordsToRead == ParquetRecordReader.NUM_RECORDS_TO_READ_NOT_SPECIFIED) {
-      this.totalNumRecordsToRead = schema.getGroupRecordCount();
-    } else {
-      assert (numRecordsToRead >= 0);
 -      this.totalNumRecordsToRead = Math.min(numRecordsToRead, schema.getGroupRecordCount());
+    if (!schema.isStarQuery()) {
+      this.nullFilledVectors = new ArrayList<>();
     }
+    this.totalNumRecordsToRead = numRecordsToRead;
   }
 
   /**
    * Create the readers needed to read columns: fixed-length or variable length.
    *
-   * @param reader
-   * @param output
-   * @throws Exception
+   * @param reader parquet record reader
+   * @param output output mutator
    */
-
   @SuppressWarnings("unchecked")
   public void buildReader(ParquetRecordReader reader, OutputMutator output) throws Exception {
 -    final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
-    // initialize all of the column read status objects
-    BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata();
 -    Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata);
-    for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
-      ColumnDescriptor column = columnMetadata.column;
 -      columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get(
 -                      columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
-      columnMetadata.buildVector(output);
-      if (! columnMetadata.isFixedLength( )) {
-        // create a reader and add it to the appropriate list
-        varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader));
-      } else if (columnMetadata.isRepeated()) {
 -        varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader));
+    if (totalNumRecordsToRead == 0) {
 +      // there is no need to spend resources to init readers, when schema will be output
+      for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
+        columnMetadata.buildVector(output);
       }
-      else {
-        fixedLenColumnReaders.add(columnMetadata.makeFixedWidthReader(reader));
+    } else {
 +      List<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
+      // initialize all of the column read status objects
+      BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata();
+      if (rowGroupMetadata != null) {
 +        Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata);
 +        for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
+          ColumnDescriptor column = columnMetadata.column;
 +          columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get(
 +            columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
+          columnMetadata.buildVector(output);
+          if (!columnMetadata.isFixedLength()) {
+            // create a reader and add it to the appropriate list
 +            varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader));
+          } else if (columnMetadata.isRepeated()) {
 +            varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader));
+          }
 
 Review comment:
   ```suggestion
             } else {
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

[GitHub] [drill] vvysotskyi commented on a change in pull request #1839: DRILL-4517: Support reading empty Parquet files

Reply via email to