Repository: hive
Updated Branches:
  refs/heads/master 4588c6076 -> 15220e8b5


HIVE-13291: ORC BI Split strategy should consider block size instead of file size (Prasanth Jayachandran reviewed by Gopal V)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/15220e8b
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/15220e8b
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/15220e8b

Branch: refs/heads/master
Commit: 15220e8b52bf934500ff8d98a131ae1059cfe6dc
Parents: 4588c60
Author: Prasanth Jayachandran <j.prasant...@gmail.com>
Authored: Mon Mar 21 12:31:52 2016 -0500
Committer: Prasanth Jayachandran <j.prasant...@gmail.com>
Committed: Mon Mar 21 12:31:52 2016 -0500

----------------------------------------------------------------------
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java   | 14 +--
 .../hive/ql/io/orc/TestInputOutputFormat.java   | 95 ++++++++++++++++++++
 2 files changed, 102 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/15220e8b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 8b611bb..fe0be7b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -891,7 +891,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
     private final boolean isOriginal;
     private final List<DeltaMetaData> deltas;
     private final FileSystem fs;
-    private final Context context;
     private final Path dir;
     private final boolean allowSyntheticFileIds;
 
@@ -899,7 +898,6 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
         Path dir, List<HdfsFileStatusWithId> fileStatuses, boolean isOriginal,
         List<DeltaMetaData> deltas, boolean[] covered, boolean allowSyntheticFileIds) {
       super(dir, context.numBuckets, deltas, covered);
-      this.context = context;
       this.fileStatuses = fileStatuses;
       this.isOriginal = isOriginal;
       this.deltas = deltas;
@@ -914,15 +912,17 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
       for (HdfsFileStatusWithId file : fileStatuses) {
         FileStatus fileStatus = file.getFileStatus();
         if (fileStatus.getLen() != 0) {
-          String[] hosts = SHIMS.getLocationsWithOffset(fs, fileStatus).firstEntry().getValue()
-              .getHosts();
           Object fileKey = file.getFileId();
           if (fileKey == null && allowSyntheticFileIds) {
             fileKey = new SyntheticFileId(fileStatus);
           }
-          OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, 0,
-              fileStatus.getLen(), hosts, null, isOriginal, true, deltas, -1);
-          splits.add(orcSplit);
+          TreeMap<Long, BlockLocation> blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus);
+          for (Map.Entry<Long, BlockLocation> entry : blockOffsets.entrySet()) {
+            OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(),
+                entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true,
+                deltas, -1);
+            splits.add(orcSplit);
+          }
         }
       }
 

http://git-wip-us.apache.org/repos/asf/hive/blob/15220e8b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index 1a64f3a..c88f6d8 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -555,6 +555,101 @@ public class TestInputOutputFormat {
   }
 
   @Test
+  public void testBIStrategySplitBlockBoundary() throws Exception {
+    conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI");
+    OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
+    MockFileSystem fs = new MockFileSystem(conf,
+        new MockFile("mock:/a/b/part-00", 1000, new byte[1], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-01", 1000, new byte[1], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-02", 1000, new byte[1], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-03", 1000, new byte[1], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-04", 1000, new byte[1], new MockBlock("host1", "host2")));
+    OrcInputFormat.FileGenerator gen =
+        new OrcInputFormat.FileGenerator(context, fs,
+            new MockPath(fs, "mock:/a/b"), false, null);
+    OrcInputFormat.SplitStrategy splitStrategy = createSplitStrategy(context, gen);
+    assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+    List<OrcSplit> splits = splitStrategy.getSplits();
+    int numSplits = splits.size();
+    assertEquals(5, numSplits);
+
+    context = new OrcInputFormat.Context(conf);
+    fs = new MockFileSystem(conf,
+        new MockFile("mock:/a/b/part-00", 1000, new byte[1000], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-01", 1000, new byte[1000], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-02", 1000, new byte[1000], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-03", 1000, new byte[1000], new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-04", 1000, new byte[1000], new MockBlock("host1", "host2")));
+    gen = new OrcInputFormat.FileGenerator(context, fs,
+        new MockPath(fs, "mock:/a/b"), false, null);
+    splitStrategy = createSplitStrategy(context, gen);
+    assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+    splits = splitStrategy.getSplits();
+    numSplits = splits.size();
+    assertEquals(5, numSplits);
+
+    context = new OrcInputFormat.Context(conf);
+    fs = new MockFileSystem(conf,
+        new MockFile("mock:/a/b/part-00", 1000, new byte[1100], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-01", 1000, new byte[1100], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-02", 1000, new byte[1100], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-03", 1000, new byte[1100], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-04", 1000, new byte[1100], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")));
+    gen = new OrcInputFormat.FileGenerator(context, fs,
+        new MockPath(fs, "mock:/a/b"), false, null);
+    splitStrategy = createSplitStrategy(context, gen);
+    assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+    splits = splitStrategy.getSplits();
+    numSplits = splits.size();
+    assertEquals(10, numSplits);
+
+    context = new OrcInputFormat.Context(conf);
+    fs = new MockFileSystem(conf,
+        new MockFile("mock:/a/b/part-00", 1000, new byte[2000], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-01", 1000, new byte[2000], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-02", 1000, new byte[2000], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-03", 1000, new byte[2000], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-04", 1000, new byte[2000], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2")));
+    gen = new OrcInputFormat.FileGenerator(context, fs,
+        new MockPath(fs, "mock:/a/b"), false, null);
+    splitStrategy = createSplitStrategy(context, gen);
+    assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+    splits = splitStrategy.getSplits();
+    numSplits = splits.size();
+    assertEquals(10, numSplits);
+
+    context = new OrcInputFormat.Context(conf);
+    fs = new MockFileSystem(conf,
+        new MockFile("mock:/a/b/part-00", 1000, new byte[2200], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-01", 1000, new byte[2200], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-02", 1000, new byte[2200], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-03", 1000, new byte[2200], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2"), new MockBlock("host1", "host2")),
+        new MockFile("mock:/a/b/part-04", 1000, new byte[2200], new MockBlock("host1", "host2"),
+            new MockBlock("host1", "host2"), new MockBlock("host1", "host2")));
+    gen = new OrcInputFormat.FileGenerator(context, fs,
+        new MockPath(fs, "mock:/a/b"), false, null);
+    splitStrategy = createSplitStrategy(context, gen);
+    assertEquals(true, splitStrategy instanceof OrcInputFormat.BISplitStrategy);
+    splits = splitStrategy.getSplits();
+    numSplits = splits.size();
+    assertEquals(15, numSplits);
+  }
+
+  @Test
   public void testEtlCombinedStrategy() throws Exception {
     conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL");
     conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_DIRECTORY_BATCH_MS.varname, "1000000");

Reply via email to