Repository: hive
Updated Branches:
  refs/heads/master 4588c6076 -> 15220e8b5
HIVE-13291: ORC BI Split strategy should consider block size instead of file size (Prasanth Jayachandran reviewed by Gopal V)

Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/15220e8b
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/15220e8b
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/15220e8b

Branch: refs/heads/master
Commit: 15220e8b52bf934500ff8d98a131ae1059cfe6dc
Parents: 4588c60
Author: Prasanth Jayachandran <j.prasant...@gmail.com>
Authored: Mon Mar 21 12:31:52 2016 -0500
Committer: Prasanth Jayachandran <j.prasant...@gmail.com>
Committed: Mon Mar 21 12:31:52 2016 -0500

----------------------------------------------------------------------
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java | 14 +--
 .../hive/ql/io/orc/TestInputOutputFormat.java | 95 ++++++++++++++++++++
 2 files changed, 102 insertions(+), 7 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/hive/blob/15220e8b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 8b611bb..fe0be7b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -891,7 +891,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
     private final boolean isOriginal;
     private final List<DeltaMetaData> deltas;
     private final FileSystem fs;
-    private final Context context;
     private final Path dir;
     private final boolean allowSyntheticFileIds;
 
@@ -899,7 +898,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
         Path dir, List<HdfsFileStatusWithId> fileStatuses, boolean isOriginal,
         List<DeltaMetaData> deltas,
boolean[] covered, boolean allowSyntheticFileIds) {
       super(dir, context.numBuckets, deltas, covered);
-      this.context = context;
       this.fileStatuses = fileStatuses;
       this.isOriginal = isOriginal;
       this.deltas = deltas;
@@ -914,15 +912,17 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
       for (HdfsFileStatusWithId file : fileStatuses) {
         FileStatus fileStatus = file.getFileStatus();
         if (fileStatus.getLen() != 0) {
-          String[] hosts = SHIMS.getLocationsWithOffset(fs, fileStatus).firstEntry().getValue()
-              .getHosts();
           Object fileKey = file.getFileId();
           if (fileKey == null && allowSyntheticFileIds) {
             fileKey = new SyntheticFileId(fileStatus);
           }
-          OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, 0,
-              fileStatus.getLen(), hosts, null, isOriginal, true, deltas, -1);
-          splits.add(orcSplit);
+          TreeMap<Long, BlockLocation> blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus);
+          for (Map.Entry<Long, BlockLocation> entry : blockOffsets.entrySet()) {
+            OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(),
+                entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true,
+                deltas, -1);
+            splits.add(orcSplit);
+          }
         }
       }
 
http://git-wip-us.apache.org/repos/asf/hive/blob/15220e8b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 1a64f3a..c88f6d8 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -555,6 +555,101 @@ public class TestInputOutputFormat {
   }
 
   @Test
+  public void testBIStrategySplitBlockBoundary() throws Exception {
+    conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI");
+    OrcInputFormat.Context context = new
OrcInputFormat.Context(conf);
+    MockFileSystem fs = new MockFileSystem(conf,
+        new MockFile("mock:/a/b/part-00", 1000, new byte[1], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-01", 1000, new byte[1], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-02", 1000, new byte[1], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-03", 1000, new byte[1], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-04", 1000, new byte[1], new MockBlock("host1", "host2")));
+    OrcInputFormat.FileGenerator gen =
+        new OrcInputFormat.FileGenerator(context, fs,
+            new MockPath(fs, "mock:/a/b"), false, null);
+    OrcInputFormat.SplitStrategy splitStrategy = createSplitStrategy(context, gen);
+    assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+    List<OrcSplit> splits = splitStrategy.getSplits();
+    int numSplits = splits.size();
+    assertEquals(5, numSplits);
+
+    context = new OrcInputFormat.Context(conf);
+    fs = new MockFileSystem(conf,
+        new MockFile("mock:/a/b/part-00", 1000, new byte[1000], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-01", 1000, new byte[1000], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-02", 1000, new byte[1000], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-03", 1000, new byte[1000], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-04", 1000, new byte[1000], new MockBlock("host1", "host2")));
+    gen = new OrcInputFormat.FileGenerator(context, fs,
+        new MockPath(fs, "mock:/a/b"), false, null);
+    splitStrategy = createSplitStrategy(context, gen);
+    assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+    splits = splitStrategy.getSplits();
+    numSplits = splits.size();
+    assertEquals(5, numSplits);
+
+    context = new OrcInputFormat.Context(conf);
+    fs = new MockFileSystem(conf,
+        new MockFile("mock:/a/b/part-00", 1000, new byte[1100], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-01", 1000, new byte[1100], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-02", 1000, new byte[1100], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-03", 1000, new byte[1100], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-04", 1000, new byte[1100], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")));
+    gen = new OrcInputFormat.FileGenerator(context, fs,
+        new MockPath(fs, "mock:/a/b"), false, null);
+    splitStrategy = createSplitStrategy(context, gen);
+    assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+    splits = splitStrategy.getSplits();
+    numSplits = splits.size();
+    assertEquals(10, numSplits);
+
+    context = new OrcInputFormat.Context(conf);
+    fs = new MockFileSystem(conf,
+        new MockFile("mock:/a/b/part-00", 1000, new byte[2000], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-01", 1000, new byte[2000], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-02", 1000, new byte[2000], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-03", 1000, new byte[2000], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-04", 1000, new byte[2000], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")));
+    gen = new OrcInputFormat.FileGenerator(context, fs,
+        new MockPath(fs, "mock:/a/b"), false, null);
+    splitStrategy = createSplitStrategy(context, gen);
+    assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+    splits = splitStrategy.getSplits();
+    numSplits = splits.size();
+    assertEquals(10, numSplits);
+
+    context = new OrcInputFormat.Context(conf);
+    fs = new MockFileSystem(conf,
+        new
MockFile("mock:/a/b/part-00", 1000, new byte[2200], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-01", 1000, new byte[2200], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-02", 1000, new byte[2200], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-03", 1000, new byte[2200], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-04", 1000, new byte[2200], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2"), new MockBlock("host1", "host2")));
+    gen = new OrcInputFormat.FileGenerator(context, fs,
+        new MockPath(fs, "mock:/a/b"), false, null);
+    splitStrategy = createSplitStrategy(context, gen);
+    assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+    splits = splitStrategy.getSplits();
+    numSplits = splits.size();
+    assertEquals(15, numSplits);
+  }
+
+  @Test
   public void testEtlCombinedStrategy() throws Exception {
     conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL");
     conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_DIRECTORY_BATCH_MS.varname, "1000000");