[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16832897#comment-16832897 ] ASF GitHub Bot commented on DRILL-7062: --- sohami commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Labels: ready-to-commit > Fix For: 1.17.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16822279#comment-16822279 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r277116139 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +84,144 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); - } +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats +TupleSchema tupleSchema = rowGroupScan.getTupleSchema(); + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + boolean doRuntimePruning = filterExpr != null && // was a filter given ? And it is not just a "TRUE" predicate +! ((filterExpr instanceof ValueExpressions.BooleanExpression) && ((ValueExpressions.BooleanExpression) filterExpr).getBoolean() ); - ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); - if (timer != null) { -long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); -logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V4.ParquetTableMetadata_v4 tableMetadataV4 = null; + Metadata_V4.ParquetFileAndRowCountMetadata fileMetadataV4 = null; + FileSelection fileSelection = null; + FilterPredicate filterPredicate = null; + Set schemaPathsInExpr = null; + Set columnsInExpr = null; + + // If pruning - Prepare the predicate and the columns before the FOR LOOP + if ( doRuntimePruning ) { +filterPredicate = AbstractGroupScanWithMetadata.getFilterPredicate(filterExpr, context, + (FunctionImplementationRegistry) context.getFunctionRegistry(), context.getOptions(), true, + true /* 
supports file implicit columns */, + tupleSchema); +// Extract only the relevant columns from the filter (sans implicit columns, if any) +schemaPathsInExpr = filterExpr.accept(new FilterEvaluatorUtils.FieldReferenceFinder(), null); +columnsInExpr = new HashSet<>(); +String partitionColumnLabel = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val; +for (SchemaPath path : schemaPathsInExpr) { + if (rowGroupScan.supportsFileImplicitColumns() && +path.toString().matches(partitionColumnLabel+"\\d+")) { +continue; // skip implicit columns like dir0, dir1 } - footers.put(rowGroup.getPath(), footer); -} -ParquetMetadata footer = footers.get(rowGroup.getPath()); - -ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, - rowGroupScan.getColumns(), readerConfig.autoCorrectCorruptedDates()); -logger.debug("Contains corrupt dates: {}.", containsCorruptDates); - -boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER); -boolean containsComplexColumn = ParquetReaderUtility.containsComplexC
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16820648#comment-16820648 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r276498080 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +84,144 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); - } +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats +TupleSchema tupleSchema = rowGroupScan.getTupleSchema(); + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + boolean doRuntimePruning = filterExpr != null && // was a filter given ? And it is not just a "TRUE" predicate +! ((filterExpr instanceof ValueExpressions.BooleanExpression) && ((ValueExpressions.BooleanExpression) filterExpr).getBoolean() ); - ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); - if (timer != null) { -long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); -logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V4.ParquetTableMetadata_v4 tableMetadataV4 = null; + Metadata_V4.ParquetFileAndRowCountMetadata fileMetadataV4 = null; + FileSelection fileSelection = null; + FilterPredicate filterPredicate = null; + Set schemaPathsInExpr = null; + Set columnsInExpr = null; + + // If pruning - Prepare the predicate and the columns before the FOR LOOP + if ( doRuntimePruning ) { +filterPredicate = AbstractGroupScanWithMetadata.getFilterPredicate(filterExpr, context, + (FunctionImplementationRegistry) context.getFunctionRegistry(), context.getOptions(), true, + true /* 
supports file implicit columns */, + tupleSchema); +// Extract only the relevant columns from the filter (sans implicit columns, if any) +schemaPathsInExpr = filterExpr.accept(new FilterEvaluatorUtils.FieldReferenceFinder(), null); +columnsInExpr = new HashSet<>(); +String partitionColumnLabel = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val; +for (SchemaPath path : schemaPathsInExpr) { + if (rowGroupScan.supportsFileImplicitColumns() && +path.toString().matches(partitionColumnLabel+"\\d+")) { +continue; // skip implicit columns like dir0, dir1 } - footers.put(rowGroup.getPath(), footer); -} -ParquetMetadata footer = footers.get(rowGroup.getPath()); - -ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, - rowGroupScan.getColumns(), readerConfig.autoCorrectCorruptedDates()); -logger.debug("Contains corrupt dates: {}.", containsCorruptDates); - -boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER); -boolean containsComplexColumn = ParquetReaderUtility.containsComplexColumn
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16820642#comment-16820642 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r276496971 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/AbstractGroupScanWithMetadata.java ## @@ -43,6 +44,7 @@ import org.apache.drill.exec.store.ColumnExplorer; import org.apache.drill.exec.store.dfs.FileSelection; import org.apache.drill.exec.store.parquet.FilterEvaluatorUtils; +// import org.apache.drill.exec.store.parquet.ParquetGroupScan; Review comment: Done, in a followup commit This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.17.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16820643#comment-16820643 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r276497080 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +84,144 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); - } +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats +TupleSchema tupleSchema = rowGroupScan.getTupleSchema(); + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + boolean doRuntimePruning = filterExpr != null && // was a filter given ? And it is not just a "TRUE" predicate +! ((filterExpr instanceof ValueExpressions.BooleanExpression) && ((ValueExpressions.BooleanExpression) filterExpr).getBoolean() ); - ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); - if (timer != null) { -long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); -logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V4.ParquetTableMetadata_v4 tableMetadataV4 = null; + Metadata_V4.ParquetFileAndRowCountMetadata fileMetadataV4 = null; + FileSelection fileSelection = null; + FilterPredicate filterPredicate = null; + Set schemaPathsInExpr = null; + Set columnsInExpr = null; + + // If pruning - Prepare the predicate and the columns before the FOR LOOP + if ( doRuntimePruning ) { +filterPredicate = AbstractGroupScanWithMetadata.getFilterPredicate(filterExpr, context, + (FunctionImplementationRegistry) context.getFunctionRegistry(), context.getOptions(), true, + true /* 
supports file implicit columns */, + tupleSchema); +// Extract only the relevant columns from the filter (sans implicit columns, if any) +schemaPathsInExpr = filterExpr.accept(new FilterEvaluatorUtils.FieldReferenceFinder(), null); +columnsInExpr = new HashSet<>(); +String partitionColumnLabel = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val; +for (SchemaPath path : schemaPathsInExpr) { + if (rowGroupScan.supportsFileImplicitColumns() && +path.toString().matches(partitionColumnLabel+"\\d+")) { +continue; // skip implicit columns like dir0, dir1 } - footers.put(rowGroup.getPath(), footer); -} -ParquetMetadata footer = footers.get(rowGroup.getPath()); - -ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, - rowGroupScan.getColumns(), readerConfig.autoCorrectCorruptedDates()); -logger.debug("Contains corrupt dates: {}.", containsCorruptDates); - -boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER); -boolean containsComplexColumn = ParquetReaderUtility.containsComplexColumn
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16819612#comment-16819612 ] ASF GitHub Bot commented on DRILL-7062: --- rhou1 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r276035958 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +84,144 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); - } +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats +TupleSchema tupleSchema = rowGroupScan.getTupleSchema(); + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + boolean doRuntimePruning = filterExpr != null && // was a filter given ? And it is not just a "TRUE" predicate +! ((filterExpr instanceof ValueExpressions.BooleanExpression) && ((ValueExpressions.BooleanExpression) filterExpr).getBoolean() ); - ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); - if (timer != null) { -long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); -logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V4.ParquetTableMetadata_v4 tableMetadataV4 = null; + Metadata_V4.ParquetFileAndRowCountMetadata fileMetadataV4 = null; + FileSelection fileSelection = null; + FilterPredicate filterPredicate = null; + Set schemaPathsInExpr = null; + Set columnsInExpr = null; + + // If pruning - Prepare the predicate and the columns before the FOR LOOP + if ( doRuntimePruning ) { +filterPredicate = AbstractGroupScanWithMetadata.getFilterPredicate(filterExpr, context, + (FunctionImplementationRegistry) context.getFunctionRegistry(), context.getOptions(), true, + true /* 
supports file implicit columns */, + tupleSchema); +// Extract only the relevant columns from the filter (sans implicit columns, if any) +schemaPathsInExpr = filterExpr.accept(new FilterEvaluatorUtils.FieldReferenceFinder(), null); +columnsInExpr = new HashSet<>(); +String partitionColumnLabel = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val; +for (SchemaPath path : schemaPathsInExpr) { + if (rowGroupScan.supportsFileImplicitColumns() && +path.toString().matches(partitionColumnLabel+"\\d+")) { +continue; // skip implicit columns like dir0, dir1 } - footers.put(rowGroup.getPath(), footer); -} -ParquetMetadata footer = footers.get(rowGroup.getPath()); - -ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, - rowGroupScan.getColumns(), readerConfig.autoCorrectCorruptedDates()); -logger.debug("Contains corrupt dates: {}.", containsCorruptDates); - -boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER); -boolean containsComplexColumn = ParquetReaderUtility.containsComplexColumn(f
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16819611#comment-16819611 ] ASF GitHub Bot commented on DRILL-7062: --- rhou1 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r276035958 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +84,144 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); - } +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats +TupleSchema tupleSchema = rowGroupScan.getTupleSchema(); + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + boolean doRuntimePruning = filterExpr != null && // was a filter given ? And it is not just a "TRUE" predicate +! ((filterExpr instanceof ValueExpressions.BooleanExpression) && ((ValueExpressions.BooleanExpression) filterExpr).getBoolean() ); - ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); - if (timer != null) { -long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); -logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V4.ParquetTableMetadata_v4 tableMetadataV4 = null; + Metadata_V4.ParquetFileAndRowCountMetadata fileMetadataV4 = null; + FileSelection fileSelection = null; + FilterPredicate filterPredicate = null; + Set schemaPathsInExpr = null; + Set columnsInExpr = null; + + // If pruning - Prepare the predicate and the columns before the FOR LOOP + if ( doRuntimePruning ) { +filterPredicate = AbstractGroupScanWithMetadata.getFilterPredicate(filterExpr, context, + (FunctionImplementationRegistry) context.getFunctionRegistry(), context.getOptions(), true, + true /* 
supports file implicit columns */, + tupleSchema); +// Extract only the relevant columns from the filter (sans implicit columns, if any) +schemaPathsInExpr = filterExpr.accept(new FilterEvaluatorUtils.FieldReferenceFinder(), null); +columnsInExpr = new HashSet<>(); +String partitionColumnLabel = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val; +for (SchemaPath path : schemaPathsInExpr) { + if (rowGroupScan.supportsFileImplicitColumns() && +path.toString().matches(partitionColumnLabel+"\\d+")) { +continue; // skip implicit columns like dir0, dir1 } - footers.put(rowGroup.getPath(), footer); -} -ParquetMetadata footer = footers.get(rowGroup.getPath()); - -ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, - rowGroupScan.getColumns(), readerConfig.autoCorrectCorruptedDates()); -logger.debug("Contains corrupt dates: {}.", containsCorruptDates); - -boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER); -boolean containsComplexColumn = ParquetReaderUtility.containsComplexColumn(f
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16819604#comment-16819604 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r276033309 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +84,144 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); - } +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats +TupleSchema tupleSchema = rowGroupScan.getTupleSchema(); + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + boolean doRuntimePruning = filterExpr != null && // was a filter given ? And it is not just a "TRUE" predicate +! ((filterExpr instanceof ValueExpressions.BooleanExpression) && ((ValueExpressions.BooleanExpression) filterExpr).getBoolean() ); - ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); - if (timer != null) { -long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); -logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V4.ParquetTableMetadata_v4 tableMetadataV4 = null; + Metadata_V4.ParquetFileAndRowCountMetadata fileMetadataV4 = null; + FileSelection fileSelection = null; + FilterPredicate filterPredicate = null; + Set schemaPathsInExpr = null; + Set columnsInExpr = null; + + // If pruning - Prepare the predicate and the columns before the FOR LOOP + if ( doRuntimePruning ) { +filterPredicate = AbstractGroupScanWithMetadata.getFilterPredicate(filterExpr, context, + (FunctionImplementationRegistry) context.getFunctionRegistry(), context.getOptions(), true, + true /* 
supports file implicit columns */, + tupleSchema); +// Extract only the relevant columns from the filter (sans implicit columns, if any) +schemaPathsInExpr = filterExpr.accept(new FilterEvaluatorUtils.FieldReferenceFinder(), null); +columnsInExpr = new HashSet<>(); +String partitionColumnLabel = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val; +for (SchemaPath path : schemaPathsInExpr) { + if (rowGroupScan.supportsFileImplicitColumns() && +path.toString().matches(partitionColumnLabel+"\\d+")) { +continue; // skip implicit columns like dir0, dir1 } - footers.put(rowGroup.getPath(), footer); -} -ParquetMetadata footer = footers.get(rowGroup.getPath()); - -ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, - rowGroupScan.getColumns(), readerConfig.autoCorrectCorruptedDates()); -logger.debug("Contains corrupt dates: {}.", containsCorruptDates); - -boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER); -boolean containsComplexColumn = ParquetReaderUtility.containsComplexC
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16819606#comment-16819606 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r276033774 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/physical/base/AbstractGroupScanWithMetadata.java ## @@ -43,6 +44,7 @@ import org.apache.drill.exec.store.ColumnExplorer; import org.apache.drill.exec.store.dfs.FileSelection; import org.apache.drill.exec.store.parquet.FilterEvaluatorUtils; +// import org.apache.drill.exec.store.parquet.ParquetGroupScan; Review comment: Remove. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.17.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16819605#comment-16819605 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r276032653 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +84,144 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); - } +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats +TupleSchema tupleSchema = rowGroupScan.getTupleSchema(); + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + boolean doRuntimePruning = filterExpr != null && // was a filter given ? And it is not just a "TRUE" predicate +! ((filterExpr instanceof ValueExpressions.BooleanExpression) && ((ValueExpressions.BooleanExpression) filterExpr).getBoolean() ); - ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); - if (timer != null) { -long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); -logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V4.ParquetTableMetadata_v4 tableMetadataV4 = null; + Metadata_V4.ParquetFileAndRowCountMetadata fileMetadataV4 = null; + FileSelection fileSelection = null; + FilterPredicate filterPredicate = null; + Set schemaPathsInExpr = null; + Set columnsInExpr = null; + + // If pruning - Prepare the predicate and the columns before the FOR LOOP + if ( doRuntimePruning ) { +filterPredicate = AbstractGroupScanWithMetadata.getFilterPredicate(filterExpr, context, + (FunctionImplementationRegistry) context.getFunctionRegistry(), context.getOptions(), true, + true /* 
supports file implicit columns */, + tupleSchema); +// Extract only the relevant columns from the filter (sans implicit columns, if any) +schemaPathsInExpr = filterExpr.accept(new FilterEvaluatorUtils.FieldReferenceFinder(), null); +columnsInExpr = new HashSet<>(); +String partitionColumnLabel = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL).string_val; +for (SchemaPath path : schemaPathsInExpr) { + if (rowGroupScan.supportsFileImplicitColumns() && +path.toString().matches(partitionColumnLabel+"\\d+")) { +continue; // skip implicit columns like dir0, dir1 } - footers.put(rowGroup.getPath(), footer); -} -ParquetMetadata footer = footers.get(rowGroup.getPath()); - -ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, - rowGroupScan.getColumns(), readerConfig.autoCorrectCorruptedDates()); -logger.debug("Contains corrupt dates: {}.", containsCorruptDates); - -boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER); -boolean containsComplexColumn = ParquetReaderUtility.containsComplexC
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16816534#comment-16816534 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r275007436 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java ## @@ -448,23 +476,23 @@ private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, /** * Get the metadata for a single file */ - private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, - final FileStatus file, final FileSystem fs, boolean allColumns, Set columnSet) throws IOException, InterruptedException { -final ParquetMetadata metadata; -final UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI(); -final Configuration conf = new Configuration(fs.getConf()); -try { - metadata = processUserUgi.doAs((PrivilegedExceptionAction)() -> { -try (ParquetFileReader parquetFileReader = ParquetFileReader.open(HadoopInputFile.fromStatus(file, conf), readerConfig.toReadOptions())) { - return parquetFileReader.getFooter(); -} - }); -} catch(Exception e) { - logger.error("Exception while reading footer of parquet file [Details - path: {}, owner: {}] as process user {}", -file.getPath(), file.getOwner(), processUserUgi.getShortUserName(), e); - throw e; + public static ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, ParquetMetadata footer, Review comment: Done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.17.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16816066#comment-16816066 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on issue #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#issuecomment-482479842 The code has been modified, simplified and passes all the tests. The changes from the first PR: (a) Cancelled item (4) above (new ctor for ParquetGroupScan). (b) Pass the TupleSchema thru the plan API, and use it to construct the FilterPredicate outside the runtime FOR loop. (c) Reduction of any unneeded new object construction and other work inside the FOR loop. E.g., only allocate metadata for the "interesting" columns. (d) Undid most of the footer passing thru the Metadata API (item (8-b) above). (e) Solved the issue with the Hive Drill Native Scan, thus RTP is no longer disabled in that case (item (1) above) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.17.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812936#comment-16812936 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r273302668 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -149,6 +219,77 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow return new ScanBatch(context, oContext, readers, implicitColumns); } + /** + * Create a reader and add it to the list of readers. + * + * @param context + * @param rowGroupScan + * @param oContext + * @param columnExplorer + * @param readers + * @param implicitColumns + * @param mapWithMaxColumns + * @param rowGroup + * @param fs + * @param footer + * @param readSchemaOnly - if true sets the number of rows to read to be zero + * @return + */ + private Map createReaderAndImplicitColumns(ExecutorFragmentContext context, + AbstractParquetRowGroupScan rowGroupScan, + OperatorContext oContext, + ColumnExplorer columnExplorer, + List readers, + List> implicitColumns, + Map mapWithMaxColumns, + RowGroupReadEntry rowGroup, + DrillFileSystem fs, + ParquetMetadata footer, + boolean readSchemaOnly) { +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, + rowGroupScan.getColumns(), readerConfig.autoCorrectCorruptedDates()); +logger.debug("Contains corrupt dates: {}.", containsCorruptDates); + +boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER); +boolean containsComplexColumn = ParquetReaderUtility.containsComplexColumn(footer, rowGroupScan.getColumns()); +logger.debug("PARQUET_NEW_RECORD_READER is {}. 
Complex columns {}.", useNewReader ? "enabled" : "disabled", +containsComplexColumn ? "found." : "not found."); +RecordReader reader; + +if (useNewReader || containsComplexColumn) { + reader = new DrillParquetReader(context, + footer, + rowGroup, + columnExplorer.getTableColumns(), + fs, + containsCorruptDates); +} else { + reader = new ParquetRecordReader(context, + rowGroup.getPath(), + rowGroup.getRowGroupIndex(), + rowGroup.getNumRecordsToRead(), // if readSchemaOnly - then set to zero rows to read (currently breaks the ScanBatch) Review comment: Commented this out, and added TODO comments. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812934#comment-16812934 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r273302407 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +83,131 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + Path selectionRoot = rowGroupScan.getSelectionRoot(); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V3.ParquetTableMetadata_v3 tableMetadataV3 = null; + Metadata_V3.ParquetFileMetadata_v3 fileMetadataV3 = null; + FileSelection fileSelection = null; + ParquetTableMetadataProviderImpl metadataProvider = null; + + for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { +/* +Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file +TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) +we should add more information to the RowGroupInfo that will be populated upon the first read to +provide the reader with all of th file meta-data it needs +These fields will be added to the constructor below +*/ + + Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; + DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); + if (!footers.containsKey(rowGroup.getPath())) { +if (timer != null) { + timer.start(); +} + +ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); +if (timer != null) { + long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); + logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); +} +footers.put(rowGroup.getPath(), footer); } + ParquetMetadata footer = footers.get(rowGroup.getPath()); + + // + // If a filter is given (and it is not just "TRUE") - then use it to perform run-time pruning + // + if ( filterExpr != null && ! (filterExpr instanceof ValueExpressions.BooleanExpression) ) { // skip when no filter or filter is TRUE Review comment: Added a check for true - getBoolean() . This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill >
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812933#comment-16812933 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r273302288 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -149,6 +219,77 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow return new ScanBatch(context, oContext, readers, implicitColumns); } + /** + * Create a reader and add it to the list of readers. + * + * @param context + * @param rowGroupScan + * @param oContext + * @param columnExplorer + * @param readers + * @param implicitColumns + * @param mapWithMaxColumns Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812932#comment-16812932 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r273302187 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -149,6 +219,77 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow return new ScanBatch(context, oContext, readers, implicitColumns); } + /** + * Create a reader and add it to the list of readers. + * + * @param context + * @param rowGroupScan + * @param oContext + * @param columnExplorer + * @param readers + * @param implicitColumns + * @param mapWithMaxColumns + * @param rowGroup + * @param fs + * @param footer + * @param readSchemaOnly - if true sets the number of rows to read to be zero + * @return Review comment: Done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812927#comment-16812927 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r273300809 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java ## @@ -52,6 +52,7 @@ private final ParquetFormatPlugin formatPlugin; private final ParquetFormatConfig formatConfig; + private final Collection drillbitEndpoints; Review comment: With the latest changes, no need for a special constructor for the ParquetGroupScan. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812926#comment-16812926 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r273300586 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java ## @@ -127,9 +127,21 @@ public static void createMeta(FileSystem fs, Path path, ParquetReaderConfig read * * @return parquet table metadata */ - public static ParquetTableMetadata_v3 getParquetTableMetadata(FileSystem fs, String path, ParquetReaderConfig readerConfig) throws IOException { + public static ParquetTableMetadata_v3 getParquetTableMetadata(ParquetMetadata footer, FileSystem fs, String path, ParquetReaderConfig readerConfig) throws IOException { Metadata metadata = new Metadata(readerConfig); -return metadata.getParquetTableMetadata(path, fs); +return metadata.getParquetTableMetadata(path, fs, footer); + } + + /** + * When the footer is not yet available (it would be read) Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812148#comment-16812148 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272902214 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java ## @@ -727,6 +727,10 @@ private ExecConstants() { public static final String BOOTSTRAP_STORAGE_PLUGINS_FILE = "bootstrap-storage-plugins.json"; + public static final String SKIP_RUNTIME_ROWGROUP_PRUNING_KEY = "exec.storage.skip_runtime_rowgroup_pruning"; Review comment: The namespace 'exec.storage.parquet' is not yet used for any option, and adding "parquet_" to the option name makes it longer and more typo prone (unless you are French :-) . Any user who understands when/why to use this option probably knows that "rowgroups" means Parquet. Are we going to introduce several other new options under this namespace ? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812141#comment-16812141 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272896651 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +83,131 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + Path selectionRoot = rowGroupScan.getSelectionRoot(); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V3.ParquetTableMetadata_v3 tableMetadataV3 = null; + Metadata_V3.ParquetFileMetadata_v3 fileMetadataV3 = null; + FileSelection fileSelection = null; + ParquetTableMetadataProviderImpl metadataProvider = null; + + for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { +/* +Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file +TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) +we should add more information to the RowGroupInfo that will be populated upon the first read to +provide the reader with all of th file meta-data it needs +These fields will be added to the constructor below +*/ + + Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; + DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); + if (!footers.containsKey(rowGroup.getPath())) { +if (timer != null) { + timer.start(); +} + +ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); +if (timer != null) { + long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); + logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); +} +footers.put(rowGroup.getPath(), footer); } + ParquetMetadata footer = footers.get(rowGroup.getPath()); + + // + // If a filter is given (and it is not just "TRUE") - then use it to perform run-time pruning + // + if ( filterExpr != null && ! (filterExpr instanceof ValueExpressions.BooleanExpression) ) { // skip when no filter or filter is TRUE Review comment: We do want to get the schema even for FALSE filter. Currently, what actually happens is that a FALSE filter in the logical plan gets converted to a LIMIT 0, so typically the scanner would see limit 0; however, if the FALSE filter was created during some intermediate stage of planning due to other transformations that occurred after the Limit_0 transformation, then it is possible that the runtime might see a FALSE filter. I haven't confirmed it but I believe it is possible. This is an a
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812142#comment-16812142 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272898010 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +83,131 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + Path selectionRoot = rowGroupScan.getSelectionRoot(); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V3.ParquetTableMetadata_v3 tableMetadataV3 = null; + Metadata_V3.ParquetFileMetadata_v3 fileMetadataV3 = null; + FileSelection fileSelection = null; + ParquetTableMetadataProviderImpl metadataProvider = null; + + for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { +/* +Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file +TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) +we should add more information to the RowGroupInfo that will be populated upon the first read to +provide the reader with all of th file meta-data it needs +These fields will be added to the constructor below +*/ + + Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; + DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); + if (!footers.containsKey(rowGroup.getPath())) { +if (timer != null) { + timer.start(); +} + +ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); +if (timer != null) { + long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); + logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); +} +footers.put(rowGroup.getPath(), footer); } + ParquetMetadata footer = footers.get(rowGroup.getPath()); + + // + // If a filter is given (and it is not just "TRUE") - then use it to perform run-time pruning + // + if ( filterExpr != null && ! (filterExpr instanceof ValueExpressions.BooleanExpression) ) { // skip when no filter or filter is TRUE + +int rowGroupIndex = rowGroup.getRowGroupIndex(); +long footerRowCount = footer.getBlocks().get(rowGroupIndex).getRowCount(); + +if ( timer != null ) { // restart the timer, if tracing + timer.reset(); + timer.start(); +} + +// When starting a new file, or at the first time - Initialize path specific metadata etc +if ( ! rowGroup.getPath().equals(prevRowGroupPath) ) { + // Get the table metadata (V3) + tableMetadataV3 = Metadata.getParquetTableMetadata(fo
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812140#comment-16812140 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272898686 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java ## @@ -52,6 +52,7 @@ private final ParquetFormatPlugin formatPlugin; private final ParquetFormatConfig formatConfig; + private final Collection drillbitEndpoints; Review comment: The runtime has access to the `ExecutorFragmentContext` which contains the `getBits()` method that returns the list of DrillBits. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812092#comment-16812092 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272881878 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +83,131 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + Path selectionRoot = rowGroupScan.getSelectionRoot(); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V3.ParquetTableMetadata_v3 tableMetadataV3 = null; + Metadata_V3.ParquetFileMetadata_v3 fileMetadataV3 = null; + FileSelection fileSelection = null; + ParquetTableMetadataProviderImpl metadataProvider = null; + + for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { +/* +Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file +TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) +we should add more information to the RowGroupInfo that will be populated upon the first read to +provide the reader with all of th file meta-data it needs +These fields will be added to the constructor below +*/ + + Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; + DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); + if (!footers.containsKey(rowGroup.getPath())) { +if (timer != null) { + timer.start(); +} + +ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); +if (timer != null) { + long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); + logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); +} +footers.put(rowGroup.getPath(), footer); } + ParquetMetadata footer = footers.get(rowGroup.getPath()); + + // + // If a filter is given (and it is not just "TRUE") - then use it to perform run-time pruning + // + if ( filterExpr != null && ! (filterExpr instanceof ValueExpressions.BooleanExpression) ) { // skip when no filter or filter is TRUE + +int rowGroupIndex = rowGroup.getRowGroupIndex(); +long footerRowCount = footer.getBlocks().get(rowGroupIndex).getRowCount(); + +if ( timer != null ) { // restart the timer, if tracing + timer.reset(); + timer.start(); +} + +// When starting a new file, or at the first time - Initialize path specific metadata etc +if ( ! rowGroup.getPath().equals(prevRowGroupPath) ) { + // Get the table metadata (V3) + tableMetadataV3 = Metadata.getParquetTableMetadata(footer,
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812083#comment-16812083 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272879356 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -149,6 +219,77 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow return new ScanBatch(context, oContext, readers, implicitColumns); } + /** + * Create a reader and add it to the list of readers. + * + * @param context + * @param rowGroupScan + * @param oContext + * @param columnExplorer + * @param readers + * @param implicitColumns + * @param mapWithMaxColumns + * @param rowGroup + * @param fs + * @param footer + * @param readSchemaOnly - if true sets the number of rows to read to be zero + * @return + */ + private Map createReaderAndImplicitColumns(ExecutorFragmentContext context, + AbstractParquetRowGroupScan rowGroupScan, + OperatorContext oContext, + ColumnExplorer columnExplorer, + List readers, + List> implicitColumns, + Map mapWithMaxColumns, + RowGroupReadEntry rowGroup, + DrillFileSystem fs, + ParquetMetadata footer, + boolean readSchemaOnly) { +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, + rowGroupScan.getColumns(), readerConfig.autoCorrectCorruptedDates()); +logger.debug("Contains corrupt dates: {}.", containsCorruptDates); + +boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER); +boolean containsComplexColumn = ParquetReaderUtility.containsComplexColumn(footer, rowGroupScan.getColumns()); +logger.debug("PARQUET_NEW_RECORD_READER is {}. 
Complex columns {}.", useNewReader ? "enabled" : "disabled", +containsComplexColumn ? "found." : "not found."); +RecordReader reader; + +if (useNewReader || containsComplexColumn) { + reader = new DrillParquetReader(context, + footer, + rowGroup, + columnExplorer.getTableColumns(), + fs, + containsCorruptDates); +} else { + reader = new ParquetRecordReader(context, + rowGroup.getPath(), + rowGroup.getRowGroupIndex(), + rowGroup.getNumRecordsToRead(), // if readSchemaOnly - then set to zero rows to read (currently breaks the ScanBatch) Review comment: This was a last minute change (before it used this flag to set the num-rows-to-read to zero). Soon (another Jira) we should retry implementing this option, so left that flag in the code as a reminder (maybe should be just a TODO comment) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812082#comment-16812082 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272878991 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +83,131 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + Path selectionRoot = rowGroupScan.getSelectionRoot(); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V3.ParquetTableMetadata_v3 tableMetadataV3 = null; + Metadata_V3.ParquetFileMetadata_v3 fileMetadataV3 = null; + FileSelection fileSelection = null; + ParquetTableMetadataProviderImpl metadataProvider = null; + + for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { +/* +Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file +TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) +we should add more information to the RowGroupInfo that will be populated upon the first read to +provide the reader with all of th file meta-data it needs +These fields will be added to the constructor below +*/ + + Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; + DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); + if (!footers.containsKey(rowGroup.getPath())) { +if (timer != null) { + timer.start(); +} + +ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); +if (timer != null) { + long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); + logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); +} +footers.put(rowGroup.getPath(), footer); } + ParquetMetadata footer = footers.get(rowGroup.getPath()); + + // + // If a filter is given (and it is not just "TRUE") - then use it to perform run-time pruning + // + if ( filterExpr != null && ! (filterExpr instanceof ValueExpressions.BooleanExpression) ) { // skip when no filter or filter is TRUE + +int rowGroupIndex = rowGroup.getRowGroupIndex(); +long footerRowCount = footer.getBlocks().get(rowGroupIndex).getRowCount(); + +if ( timer != null ) { // restart the timer, if tracing + timer.reset(); + timer.start(); +} + +// When starting a new file, or at the first time - Initialize path specific metadata etc +if ( ! rowGroup.getPath().equals(prevRowGroupPath) ) { + // Get the table metadata (V3) + tableMetadataV3 = Metadata.getParquetTableMetadata(footer,
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812081#comment-16812081 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272878723 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +83,131 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + Path selectionRoot = rowGroupScan.getSelectionRoot(); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V3.ParquetTableMetadata_v3 tableMetadataV3 = null; + Metadata_V3.ParquetFileMetadata_v3 fileMetadataV3 = null; + FileSelection fileSelection = null; + ParquetTableMetadataProviderImpl metadataProvider = null; + + for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { +/* +Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file +TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) +we should add more information to the RowGroupInfo that will be populated upon the first read to +provide the reader with all of th file meta-data it needs +These fields will be added to the constructor below +*/ + + Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; + DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); + if (!footers.containsKey(rowGroup.getPath())) { +if (timer != null) { + timer.start(); +} + +ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); +if (timer != null) { + long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); + logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); +} +footers.put(rowGroup.getPath(), footer); } + ParquetMetadata footer = footers.get(rowGroup.getPath()); + + // + // If a filter is given (and it is not just "TRUE") - then use it to perform run-time pruning + // + if ( filterExpr != null && ! (filterExpr instanceof ValueExpressions.BooleanExpression) ) { // skip when no filter or filter is TRUE Review comment: Can the planner ever create a FALSE filter expression ? It is equivalent to eliminating the Scan operator This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.a
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812079#comment-16812079 ] ASF GitHub Bot commented on DRILL-7062: --- Ben-Zvi commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272878398 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java ## @@ -52,6 +52,7 @@ private final ParquetFormatPlugin formatPlugin; private final ParquetFormatConfig formatConfig; + private final Collection drillbitEndpoints; Review comment: How can we get to the formatPlugin (or its context) at the runtime ? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812048#comment-16812048 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272869882 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +83,131 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + Path selectionRoot = rowGroupScan.getSelectionRoot(); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V3.ParquetTableMetadata_v3 tableMetadataV3 = null; + Metadata_V3.ParquetFileMetadata_v3 fileMetadataV3 = null; + FileSelection fileSelection = null; + ParquetTableMetadataProviderImpl metadataProvider = null; + + for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { +/* +Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file +TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) +we should add more information to the RowGroupInfo that will be populated upon the first read to +provide the reader with all of th file meta-data it needs +These fields will be added to the constructor below +*/ + + Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; + DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); + if (!footers.containsKey(rowGroup.getPath())) { +if (timer != null) { + timer.start(); +} + +ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); +if (timer != null) { + long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); + logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); +} +footers.put(rowGroup.getPath(), footer); } + ParquetMetadata footer = footers.get(rowGroup.getPath()); + + // + // If a filter is given (and it is not just "TRUE") - then use it to perform run-time pruning + // + if ( filterExpr != null && ! (filterExpr instanceof ValueExpressions.BooleanExpression) ) { // skip when no filter or filter is TRUE + +int rowGroupIndex = rowGroup.getRowGroupIndex(); +long footerRowCount = footer.getBlocks().get(rowGroupIndex).getRowCount(); + +if ( timer != null ) { // restart the timer, if tracing + timer.reset(); + timer.start(); +} + +// When starting a new file, or at the first time - Initialize path specific metadata etc +if ( ! rowGroup.getPath().equals(prevRowGroupPath) ) { + // Get the table metadata (V3) + tableMetadataV3 = Metadata.getParquetTableMetadata(fo
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812040#comment-16812040 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272858014 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java ## @@ -448,23 +476,23 @@ private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, /** * Get the metadata for a single file */ - private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, - final FileStatus file, final FileSystem fs, boolean allColumns, Set columnSet) throws IOException, InterruptedException { -final ParquetMetadata metadata; -final UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI(); -final Configuration conf = new Configuration(fs.getConf()); -try { - metadata = processUserUgi.doAs((PrivilegedExceptionAction)() -> { -try (ParquetFileReader parquetFileReader = ParquetFileReader.open(HadoopInputFile.fromStatus(file, conf), readerConfig.toReadOptions())) { - return parquetFileReader.getFooter(); -} - }); -} catch(Exception e) { - logger.error("Exception while reading footer of parquet file [Details - path: {}, owner: {}] as process user {}", -file.getPath(), file.getOwner(), processUserUgi.getShortUserName(), e); - throw e; + public static ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, ParquetMetadata footer, Review comment: A reminder that a similar change would need to be done for V4 once it is merged. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812041#comment-16812041 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272860264 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -149,6 +219,77 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow return new ScanBatch(context, oContext, readers, implicitColumns); } + /** + * Create a reader and add it to the list of readers. + * + * @param context + * @param rowGroupScan + * @param oContext + * @param columnExplorer + * @param readers + * @param implicitColumns + * @param mapWithMaxColumns + * @param rowGroup + * @param fs + * @param footer + * @param readSchemaOnly - if true sets the number of rows to read to be zero + * @return Review comment: Add a brief description of the return This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812046#comment-16812046 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272862031 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +83,131 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + Path selectionRoot = rowGroupScan.getSelectionRoot(); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V3.ParquetTableMetadata_v3 tableMetadataV3 = null; + Metadata_V3.ParquetFileMetadata_v3 fileMetadataV3 = null; + FileSelection fileSelection = null; + ParquetTableMetadataProviderImpl metadataProvider = null; + + for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { +/* +Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file +TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) +we should add more information to the RowGroupInfo that will be populated upon the first read to +provide the reader with all of th file meta-data it needs +These fields will be added to the constructor below +*/ + + Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; + DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); + if (!footers.containsKey(rowGroup.getPath())) { +if (timer != null) { + timer.start(); +} + +ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); +if (timer != null) { + long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); + logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); +} +footers.put(rowGroup.getPath(), footer); } + ParquetMetadata footer = footers.get(rowGroup.getPath()); + + // + // If a filter is given (and it is not just "TRUE") - then use it to perform run-time pruning + // + if ( filterExpr != null && ! (filterExpr instanceof ValueExpressions.BooleanExpression) ) { // skip when no filter or filter is TRUE + +int rowGroupIndex = rowGroup.getRowGroupIndex(); +long footerRowCount = footer.getBlocks().get(rowGroupIndex).getRowCount(); + +if ( timer != null ) { // restart the timer, if tracing + timer.reset(); + timer.start(); +} + +// When starting a new file, or at the first time - Initialize path specific metadata etc +if ( ! rowGroup.getPath().equals(prevRowGroupPath) ) { + // Get the table metadata (V3) + tableMetadataV3 = Metadata.getParquetTableMetadata(fo
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812043#comment-16812043 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272860190 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetGroupScan.java ## @@ -52,6 +52,7 @@ private final ParquetFormatPlugin formatPlugin; private final ParquetFormatConfig formatConfig; + private final Collection drillbitEndpoints; Review comment: This seems unrelated to this PR since one can get the same information from the formatPlugin's context. Any specific reason for this ? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812039#comment-16812039 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272853239 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/ExecConstants.java ## @@ -727,6 +727,10 @@ private ExecConstants() { public static final String BOOTSTRAP_STORAGE_PLUGINS_FILE = "bootstrap-storage-plugins.json"; + public static final String SKIP_RUNTIME_ROWGROUP_PRUNING_KEY = "exec.storage.skip_runtime_rowgroup_pruning"; Review comment: Since row groups are Parquet specific, should we make the option under a 'exec.storage.parquet' namespace ? or name it something like exec.storage.parquet_skip_runtime_rowgroup_pruning. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812045#comment-16812045 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272868355 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -149,6 +219,77 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow return new ScanBatch(context, oContext, readers, implicitColumns); } + /** + * Create a reader and add it to the list of readers. + * + * @param context + * @param rowGroupScan + * @param oContext + * @param columnExplorer + * @param readers + * @param implicitColumns + * @param mapWithMaxColumns + * @param rowGroup + * @param fs + * @param footer + * @param readSchemaOnly - if true sets the number of rows to read to be zero + * @return + */ + private Map createReaderAndImplicitColumns(ExecutorFragmentContext context, + AbstractParquetRowGroupScan rowGroupScan, + OperatorContext oContext, + ColumnExplorer columnExplorer, + List readers, + List> implicitColumns, + Map mapWithMaxColumns, + RowGroupReadEntry rowGroup, + DrillFileSystem fs, + ParquetMetadata footer, + boolean readSchemaOnly) { +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, + rowGroupScan.getColumns(), readerConfig.autoCorrectCorruptedDates()); +logger.debug("Contains corrupt dates: {}.", containsCorruptDates); + +boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER); +boolean containsComplexColumn = ParquetReaderUtility.containsComplexColumn(footer, rowGroupScan.getColumns()); +logger.debug("PARQUET_NEW_RECORD_READER is {}. 
Complex columns {}.", useNewReader ? "enabled" : "disabled", +containsComplexColumn ? "found." : "not found."); +RecordReader reader; + +if (useNewReader || containsComplexColumn) { + reader = new DrillParquetReader(context, + footer, + rowGroup, + columnExplorer.getTableColumns(), + fs, + containsCorruptDates); +} else { + reader = new ParquetRecordReader(context, + rowGroup.getPath(), + rowGroup.getRowGroupIndex(), + rowGroup.getNumRecordsToRead(), // if readSchemaOnly - then set to zero rows to read (currently breaks the ScanBatch) Review comment: The `readSchemaOnly` is not getting used, so why even pass it to this function ? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812044#comment-16812044 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272858430 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/metadata/Metadata.java ## @@ -127,9 +127,21 @@ public static void createMeta(FileSystem fs, Path path, ParquetReaderConfig read * * @return parquet table metadata */ - public static ParquetTableMetadata_v3 getParquetTableMetadata(FileSystem fs, String path, ParquetReaderConfig readerConfig) throws IOException { + public static ParquetTableMetadata_v3 getParquetTableMetadata(ParquetMetadata footer, FileSystem fs, String path, ParquetReaderConfig readerConfig) throws IOException { Metadata metadata = new Metadata(readerConfig); -return metadata.getParquetTableMetadata(path, fs); +return metadata.getParquetTableMetadata(path, fs, footer); + } + + /** + * When the footer is not yet available (it would be read) Review comment: The comment seems incomplete..it should mention something about what the `getParquetTableMetadata()` does and add the footer information as a side-note, not the main description. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812042#comment-16812042 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272861023 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -149,6 +219,77 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow return new ScanBatch(context, oContext, readers, implicitColumns); } + /** + * Create a reader and add it to the list of readers. + * + * @param context + * @param rowGroupScan + * @param oContext + * @param columnExplorer + * @param readers + * @param implicitColumns + * @param mapWithMaxColumns Review comment: It's odd that only 1 param is described and others are empty. For a few non-obvious ones (especially the `mapWithMaxColumns`) a short description would be good. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Run-time row group pruning > -- > > Key: DRILL-7062 > URL: https://issues.apache.org/jira/browse/DRILL-7062 > Project: Apache Drill > Issue Type: Sub-task > Components: Metadata >Reporter: Venkata Jyothsna Donapati >Assignee: Boaz Ben-Zvi >Priority: Major > Fix For: 1.16.0 > > Original Estimate: 504h > Remaining Estimate: 504h > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (DRILL-7062) Run-time row group pruning
[ https://issues.apache.org/jira/browse/DRILL-7062?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16812047#comment-16812047 ] ASF GitHub Bot commented on DRILL-7062: --- amansinha100 commented on pull request #1738: DRILL-7062: Initial implementation of run-time row-group pruning URL: https://github.com/apache/drill/pull/1738#discussion_r272861849 ## File path: exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/AbstractParquetScanBatchCreator.java ## @@ -68,76 +83,131 @@ protected ScanBatch getBatch(ExecutorFragmentContext context, AbstractParquetRow List readers = new LinkedList<>(); List> implicitColumns = new ArrayList<>(); Map mapWithMaxColumns = new LinkedHashMap<>(); -for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { - /* - Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file - TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) - we should add more information to the RowGroupInfo that will be populated upon the first read to - provide the reader with all of th file meta-data it needs - These fields will be added to the constructor below - */ - try { -Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; -DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); -ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); -if (!footers.containsKey(rowGroup.getPath())) { - if (timer != null) { -timer.start(); +ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig(); +RowGroupReadEntry firstRowGroup = null; // to be scanned in case ALL row groups are pruned out +ParquetMetadata firstFooter = null; +long rowgroupsPruned = 0; // for stats + +try { + + LogicalExpression filterExpr = rowGroupScan.getFilter(); + Path selectionRoot = rowGroupScan.getSelectionRoot(); + // Runtime pruning: Avoid recomputing metadata objects for each row-group in case they use the same file + // by keeping the following objects computed earlier (relies on same file being in consecutive rowgroups) + Path prevRowGroupPath = null; + Metadata_V3.ParquetTableMetadata_v3 tableMetadataV3 = null; + Metadata_V3.ParquetFileMetadata_v3 fileMetadataV3 = null; + FileSelection fileSelection = null; + ParquetTableMetadataProviderImpl metadataProvider = null; + + for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) { +/* +Here we could store a map from file names to footers, to prevent re-reading the footer for each row group in a file +TODO - to prevent reading the footer again in the parquet record reader (it is read earlier in the ParquetStorageEngine) +we should add more information to the RowGroupInfo that will be populated upon the first read to +provide the reader with all of the file meta-data it needs +These fields will be added to the constructor below +*/ + + Stopwatch timer = logger.isTraceEnabled() ? 
Stopwatch.createUnstarted() : null; + DrillFileSystem fs = fsManager.get(rowGroupScan.getFsConf(rowGroup), rowGroup.getPath()); + if (!footers.containsKey(rowGroup.getPath())) { +if (timer != null) { + timer.start(); +} + +ParquetMetadata footer = readFooter(fs.getConf(), rowGroup.getPath(), readerConfig); +if (timer != null) { + long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS); + logger.trace("ParquetTrace,Read Footer,{},{},{},{},{},{},{}", "", rowGroup.getPath(), "", 0, 0, 0, timeToRead); +} +footers.put(rowGroup.getPath(), footer); } + ParquetMetadata footer = footers.get(rowGroup.getPath()); + + // + // If a filter is given (and it is not just "TRUE") - then use it to perform run-time pruning + // + if ( filterExpr != null && ! (filterExpr instanceof ValueExpressions.BooleanExpression) ) { // skip when no filter or filter is TRUE Review comment: In the check for `! (filterExpr instanceof BooleanExpression)`, the `BooleanExpression` could represent either True or False. I suppose your assumption is that if this was False, we would not have gotten to the execution stage but I think for correctness sake you should do the more specific check. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this servic