HADOOP-11445. Bzip2Codec: Data block is skipped when position of newly created 
stream is equal to start of split. Contributed by Ankit Kamboj


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/1c942bc5
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/1c942bc5
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/1c942bc5

Branch: refs/heads/HDFS-EC
Commit: 1c942bc5964b0eecd1eac5c08a221985df41c9a3
Parents: fa505d1
Author: Jason Lowe <[email protected]>
Authored: Tue Jan 6 21:19:10 2015 +0000
Committer: Zhe Zhang <[email protected]>
Committed: Mon Jan 12 10:17:59 2015 -0800

----------------------------------------------------------------------
 hadoop-common-project/hadoop-common/CHANGES.txt |  3 +++
 .../apache/hadoop/io/compress/BZip2Codec.java   |  2 +-
 .../hadoop/mapred/TestLineRecordReader.java     | 21 ++++++++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/1c942bc5/hadoop-common-project/hadoop-common/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt 
b/hadoop-common-project/hadoop-common/CHANGES.txt
index e7a2061..49438aa 100644
--- a/hadoop-common-project/hadoop-common/CHANGES.txt
+++ b/hadoop-common-project/hadoop-common/CHANGES.txt
@@ -677,6 +677,9 @@ Release 2.7.0 - UNRELEASED
     HADOOP-11459. Fix recent findbugs in ActiveStandbyElector, NetUtils
     and ShellBasedIdMapping (vinayakumarb)
 
+    HADOOP-11445. Bzip2Codec: Data block is skipped when position of newly
+    created stream is equal to start of split (Ankit Kamboj via jlowe)
+
 Release 2.6.0 - 2014-11-18
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/1c942bc5/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
----------------------------------------------------------------------
diff --git 
a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
 
b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
index 91178ec..2c5a7be 100644
--- 
a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
+++ 
b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/BZip2Codec.java
@@ -225,7 +225,7 @@ public class BZip2Codec implements Configurable, 
SplittableCompressionCodec {
     // ........................................^^[We align at wrong position!]
     // ...........................................................^^[While 
this pos is correct]
 
-    if (in.getPos() <= start) {
+    if (in.getPos() < start) {
       ((Seekable)seekableIn).seek(start);
       in = new BZip2CompressionInputStream(seekableIn, start, end, readMode);
     }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/1c942bc5/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java
----------------------------------------------------------------------
diff --git 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java
 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java
index a7a87c9..4c94e59 100644
--- 
a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java
+++ 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java
@@ -106,6 +106,27 @@ public class TestLineRecordReader {
     testSplitRecords("blockEndingInCRThenLF.txt.bz2", 136498);
   }
 
+  //This test ensures record reader doesn't lose records when it starts
+  //exactly at the starting byte of a bz2 compressed block
+  @Test
+  public void testBzip2SplitStartAtBlockMarker() throws IOException {
+    //136504 in blockEndingInCR.txt.bz2 is the byte at which the bz2 block ends
+    //In the following test cases record readers should iterate over all the 
records
+    //and should not miss any record.
+
+    //Start next split at just the start of the block.
+    testSplitRecords("blockEndingInCR.txt.bz2", 136504);
+
+    //Start next split a byte forward in next block.
+    testSplitRecords("blockEndingInCR.txt.bz2", 136505);
+
+    //Start next split 3 bytes forward in next block.
+    testSplitRecords("blockEndingInCR.txt.bz2", 136508);
+
+    //Start next split 10 bytes from behind the end marker.
+    testSplitRecords("blockEndingInCR.txt.bz2", 136494);
+  }
+
   // Use the LineRecordReader to read records from the file
   public ArrayList<String> readRecords(URL testFileUrl, int splitSize)
       throws IOException {

Reply via email to