[ https://issues.apache.org/jira/browse/DRILL-5356?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15959866#comment-15959866 ]
ASF GitHub Bot commented on DRILL-5356: --------------------------------------- Github user ppadma commented on a diff in the pull request: https://github.com/apache/drill/pull/789#discussion_r110266898 --- Diff: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/BatchReader.java --- @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.exec.store.parquet.columnreaders; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import com.google.common.base.Stopwatch; +import com.google.common.collect.Lists; + +/** + * Base strategy for reading a batch of Parquet records. 
+ */ +public abstract class BatchReader { + + protected final ReadState readState; + + public BatchReader(ReadState readState) { + this.readState = readState; + } + + public int readBatch() throws Exception { + ColumnReader<?> firstColumnStatus = readState.getFirstColumnReader(); + long recordsToRead = Math.min(getReadCount(firstColumnStatus), readState.getRecordsToRead()); + int readCount = readRecords(firstColumnStatus, recordsToRead); + readState.fillNullVectors(readCount); + return readCount; + } + + protected abstract long getReadCount(ColumnReader<?> firstColumnStatus); + + protected abstract int readRecords(ColumnReader<?> firstColumnStatus, long recordsToRead) throws Exception; + + protected void readAllFixedFields(long recordsToRead) throws Exception { + Stopwatch timer = Stopwatch.createStarted(); + if(readState.useAsyncColReader()){ + readAllFixedFieldsParallel(recordsToRead); + } else { + readAllFixedFieldsSerial(recordsToRead); + } + readState.parquetReaderStats().timeFixedColumnRead.addAndGet(timer.elapsed(TimeUnit.NANOSECONDS)); + } + + protected void readAllFixedFieldsSerial(long recordsToRead) throws IOException { + for (ColumnReader<?> crs : readState.getColumnReaders()) { + crs.processPages(recordsToRead); + } + } + + protected void readAllFixedFieldsParallel(long recordsToRead) throws Exception { + ArrayList<Future<Long>> futures = Lists.newArrayList(); + for (ColumnReader<?> crs : readState.getColumnReaders()) { + Future<Long> f = crs.processPagesAsync(recordsToRead); + futures.add(f); + } + Exception exception = null; + for(Future<Long> f: futures){ + if (exception != null) { + f.cancel(true); + } else { + try { + f.get(); + } catch (Exception e) { + f.cancel(true); + exception = e; + } + } + } + if (exception != null) { + throw exception; + } + } + + /** + * Strategy for reading mock records. (What are these?) + */ --- End diff -- Please add brief explanation what these are instead of "what are these ?" 
> Refactor Parquet Record Reader > ------------------------------ > > Key: DRILL-5356 > URL: https://issues.apache.org/jira/browse/DRILL-5356 > Project: Apache Drill > Issue Type: Improvement > Affects Versions: 1.10.0, 1.11.0 > Reporter: Paul Rogers > Assignee: Paul Rogers > Priority: Minor > Fix For: 1.11.0 > > > The Parquet record reader class is a key part of Drill that has evolved over > time to become somewhat hard to follow. > A number of us are working on Parquet-related tasks and find we have to spend > an uncomfortable amount of time trying to understand the code. In particular, > this writer needs to figure out how to convince the reader to provide > higher-density record batches. > Rather than continue to decipher the complex code multiple times, this ticket > requests to refactor the code to make it functionally identical, but > structurally cleaner. The result will be faster time to value when working > with this code. > This is a lower-priority change and will be coordinated with others working > on this code base. This ticket is only for the record reader class itself; it > does not include the various readers and writers that Parquet uses since > another project is actively modifying those classes. -- This message was sent by Atlassian JIRA (v6.3.15#6346)