DRILL-4380: Fix performance regression: in creation of FileSelection in ParquetFormatPlugin to not set files if metadata cache is available. This closes #369
This change is slightly modified from the version in the master branch, as the 1.5 release branch did not include the Guava version update. The change simply uses the old constructor pattern for creating StopWatch objects. Also backported the change of the FileSelection object constructor to public access. As stated in the DRILL-4380 discussion the usage patterns of creating FileSelections and a clarification of the API will be covered by DRILL-4381. Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/921bb4f0 Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/921bb4f0 Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/921bb4f0 Branch: refs/heads/1.5.0 Commit: 921bb4f0293f01ecb021bbaeecc7ebe496a7e559 Parents: a9383ee Author: Parth Chandra <par...@apache.org> Authored: Thu Dec 17 16:30:42 2015 -0800 Committer: Jason Altekruse <altekruseja...@gmail.com> Committed: Tue Feb 9 15:19:25 2016 -0800 ---------------------------------------------------------------------- .../drill/exec/store/dfs/FileSelection.java | 23 ++++++++++++++++---- .../exec/store/parquet/ParquetFormatPlugin.java | 2 +- 2 files changed, 20 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/drill/blob/921bb4f0/exec/java-exec/src/main/java/org/apache/drill/exec/store/dfs/FileSelection.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/dfs/FileSelection.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/dfs/FileSelection.java index 6df3ffc..c384401 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/dfs/FileSelection.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/dfs/FileSelection.java @@ -20,11 +20,13 @@ package org.apache.drill.exec.store.dfs; import java.io.IOException; import java.net.URI; import java.util.List; +import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; import javax.annotation.Nullable; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; +import com.google.common.base.Stopwatch; import com.google.common.base.Strings; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; @@ -52,7 +54,7 @@ public class FileSelection { * @param files list of files * @param selectionRoot root path for selections */ - protected FileSelection(final List<FileStatus> statuses, final List<String> files, final String selectionRoot) { + public FileSelection(final List<FileStatus> statuses, final List<String> files, final String selectionRoot) { this.statuses = statuses; this.files = files; this.selectionRoot = Preconditions.checkNotNull(selectionRoot); @@ -73,13 +75,18 @@ public class FileSelection { } public List<FileStatus> getStatuses(final DrillFileSystem fs) throws IOException { - if (statuses == null) { + Stopwatch timer = new Stopwatch().start(); + + if (statuses == null) { final List<FileStatus> newStatuses = Lists.newArrayList(); for (final String pathStr:files) { newStatuses.add(fs.getFileStatus(new Path(pathStr))); } statuses = newStatuses; } + logger.debug("FileSelection.getStatuses() took {} ms, numFiles: {}", + timer.elapsed(TimeUnit.MILLISECONDS), statuses == null ? 0 : statuses.size()); + return statuses; } @@ -104,6 +111,7 @@ public class FileSelection { } public FileSelection minusDirectories(DrillFileSystem fs) throws IOException { + Stopwatch timer = new Stopwatch().start(); final List<FileStatus> statuses = getStatuses(fs); final int total = statuses.size(); final Path[] paths = new Path[total]; @@ -118,7 +126,10 @@ public class FileSelection { } })); - return create(nonDirectories, null, selectionRoot); + final FileSelection fileSel = create(nonDirectories, null, selectionRoot); + logger.debug("FileSelection.minusDirectories() took {} ms, numFiles: {}", + timer.elapsed(TimeUnit.MILLISECONDS), total); + return fileSel; } public FileStatus getFirstPath(DrillFileSystem fs) throws IOException { @@ -183,12 +194,16 @@ public class FileSelection { } public static FileSelection create(final DrillFileSystem fs, final String parent, final String path) throws IOException { + Stopwatch timer = new Stopwatch().start(); final Path combined = new Path(parent, removeLeadingSlash(path)); final FileStatus[] statuses = fs.globStatus(combined); if (statuses == null) { return null; } - return create(Lists.newArrayList(statuses), null, combined.toUri().toString()); + final FileSelection fileSel = create(Lists.newArrayList(statuses), null, combined.toUri().toString()); + logger.debug("FileSelection.create() took {} ms ", timer.elapsed(TimeUnit.MILLISECONDS)); + return fileSel; + } /** http://git-wip-us.apache.org/repos/asf/drill/blob/921bb4f0/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetFormatPlugin.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetFormatPlugin.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetFormatPlugin.java index e2cc670..a924bea 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetFormatPlugin.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/ParquetFormatPlugin.java @@ -233,7 +233,7 @@ public class ParquetFormatPlugin implements FormatPlugin{ // /a/b/c.parquet and the format of the selection root must match that of the file names // otherwise downstream operations such as partition pruning can break. final Path metaRootPath = Path.getPathWithoutSchemeAndAuthority(metaRootDir.getPath()); - final FileSelection newSelection = FileSelection.create(null, fileNames, metaRootPath.toString()); + final FileSelection newSelection = new FileSelection(selection.getStatuses(fs), fileNames, metaRootPath.toString()); return ParquetFileSelection.create(newSelection, metadata); } else { // don't expand yet; ParquetGroupScan's metadata gathering operation