[ https://issues.apache.org/jira/browse/DRILL-4517?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16904670#comment-16904670 ]
ASF GitHub Bot commented on DRILL-4517: --------------------------------------- vvysotskyi commented on pull request #1839: DRILL-4517: Support reading empty Parquet files URL: https://github.com/apache/drill/pull/1839#discussion_r312739651 ########## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ReadState.java ########## @@ -69,69 +69,59 @@ private boolean useAsyncColReader; public ReadState(ParquetSchema schema, - RecordBatchSizerManager batchSizerMgr, ParquetReaderStats parquetReaderStats, long numRecordsToRead, - boolean useAsyncColReader) { - + RecordBatchSizerManager batchSizerMgr, + ParquetReaderStats parquetReaderStats, + long numRecordsToRead, + boolean useAsyncColReader) { this.schema = schema; this.batchSizerMgr = batchSizerMgr; this.parquetReaderStats = parquetReaderStats; this.useAsyncColReader = useAsyncColReader; - if (! schema.isStarQuery()) { - nullFilledVectors = new ArrayList<>(); - } - - // In the case where runtime pruning prunes out all the rowgroups, then just a single rowgroup - // with zero rows is read (in order to get the schema, no need for the rows) - if ( numRecordsToRead == 0 ) { - this.totalNumRecordsToRead = 0; - return; - } - - // Because of JIRA DRILL-6528, the Parquet reader is sometimes getting the wrong - // number of rows to read. For now, returning all a file data (till - // downstream operator stop consuming). - numRecordsToRead = -1; - - // Callers can pass -1 if they want to read all rows. - if (numRecordsToRead == ParquetRecordReader.NUM_RECORDS_TO_READ_NOT_SPECIFIED) { - this.totalNumRecordsToRead = schema.getGroupRecordCount(); - } else { - assert (numRecordsToRead >= 0); - this.totalNumRecordsToRead = Math.min(numRecordsToRead, schema.getGroupRecordCount()); + if (!schema.isStarQuery()) { + this.nullFilledVectors = new ArrayList<>(); } + this.totalNumRecordsToRead = numRecordsToRead; } /** * Create the readers needed to read columns: fixed-length or variable length. 
* - * @param reader - * @param output - * @throws Exception + * @param reader parquet record reader + * @param output output mutator */ - @SuppressWarnings("unchecked") public void buildReader(ParquetRecordReader reader, OutputMutator output) throws Exception { - final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>(); - // initialize all of the column read status objects - BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata(); - Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata); - for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) { - ColumnDescriptor column = columnMetadata.column; - columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get( - columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath()))); - columnMetadata.buildVector(output); - if (! columnMetadata.isFixedLength( )) { - // create a reader and add it to the appropriate list - varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader)); - } else if (columnMetadata.isRepeated()) { - varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader)); + if (totalNumRecordsToRead == 0) { + // there is no need to spend resources to init readers, when schema will be output + for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) { + columnMetadata.buildVector(output); } - else { - fixedLenColumnReaders.add(columnMetadata.makeFixedWidthReader(reader)); + } else { + List<VarLengthColumn<? 
extends ValueVector>> varLengthColumns = new ArrayList<>(); + // initialize all of the column read status objects + BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata(); + if (rowGroupMetadata != null) { + Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata); + for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) { + ColumnDescriptor column = columnMetadata.column; + columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get( + columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath()))); + columnMetadata.buildVector(output); + if (!columnMetadata.isFixedLength()) { + // create a reader and add it to the appropriate list + varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader)); + } else if (columnMetadata.isRepeated()) { + varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader)); + } Review comment: ```suggestion } else { ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Reading empty Parquet file fails with java.lang.IllegalArgumentException > ------------------------------------------------------------------------- > > Key: DRILL-4517 > URL: https://issues.apache.org/jira/browse/DRILL-4517 > Project: Apache Drill > Issue Type: Improvement > Components: Server > Reporter: Tobias > Assignee: Arina Ielchiieva > Priority: Major > Labels: doc-impacting > Fix For: 1.17.0 > > Attachments: empty.parquet, no_rows.parquet > > > When querying a Parquet file that has a schema but no rows the Drill Server > will fail with the below > This looks similar to DRILL-3557 > {noformat} > {{ParquetMetaData{FileMetaData{schema: message TRANSACTION_REPORT { > required int64 MEMBER_ACCOUNT_ID; > required int64 TIMESTAMP_IN_HOUR; > optional int64 APPLICATION_ID; > } > , metadata: {}}}, blocks: []} > {noformat} > {noformat} > Caused by: java.lang.IllegalArgumentException: MinorFragmentId 0 has no read > entries assigned > at > com.google.common.base.Preconditions.checkArgument(Preconditions.java:92) > ~[guava-14.0.1.jar:na] > at > org.apache.drill.exec.store.parquet.ParquetGroupScan.getSpecificScan(ParquetGroupScan.java:707) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.store.parquet.ParquetGroupScan.getSpecificScan(ParquetGroupScan.java:105) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.planner.fragment.Materializer.visitGroupScan(Materializer.java:68) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.planner.fragment.Materializer.visitGroupScan(Materializer.java:35) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.physical.base.AbstractGroupScan.accept(AbstractGroupScan.java:60) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.planner.fragment.Materializer.visitOp(Materializer.java:102) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > 
org.apache.drill.exec.planner.fragment.Materializer.visitOp(Materializer.java:35) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.physical.base.AbstractPhysicalVisitor.visitProject(AbstractPhysicalVisitor.java:77) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.physical.config.Project.accept(Project.java:51) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.planner.fragment.Materializer.visitStore(Materializer.java:82) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.planner.fragment.Materializer.visitStore(Materializer.java:35) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.physical.base.AbstractPhysicalVisitor.visitScreen(AbstractPhysicalVisitor.java:195) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.physical.config.Screen.accept(Screen.java:97) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.planner.fragment.SimpleParallelizer.generateWorkUnit(SimpleParallelizer.java:355) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.planner.fragment.SimpleParallelizer.getFragments(SimpleParallelizer.java:134) > ~[drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.work.foreman.Foreman.getQueryWorkUnit(Foreman.java:518) > [drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.work.foreman.Foreman.runPhysicalPlan(Foreman.java:405) > [drill-java-exec-1.5.0.jar:1.5.0] > at > org.apache.drill.exec.work.foreman.Foreman.runSQL(Foreman.java:926) > [drill-java-exec-1.5.0.jar:1.5.0] > {noformat} -- This message was sent by Atlassian JIRA (v7.6.14#76016)