Repository: hadoop
Updated Branches:
  refs/heads/branch-2.7 1918da8fc -> 7ded648ae
MAPREDUCE-6558. multibyte delimiters with compressed input files generate duplicate records. Contributed by Wilfred Spiegelenburg
(cherry picked from commit 9227dfc25f373a99cb66ad7d6bacef8dcf336f77)

Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/7ded648a
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/7ded648a
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/7ded648a

Branch: refs/heads/branch-2.7
Commit: 7ded648aee215a77bc2db4207b21ab89afcfa43b
Parents: 1918da8
Author: Jason Lowe <jl...@apache.org>
Authored: Fri May 13 14:37:55 2016 +0000
Committer: Jason Lowe <jl...@apache.org>
Committed: Fri May 13 14:37:55 2016 +0000

----------------------------------------------------------------------
 hadoop-mapreduce-project/CHANGES.txt            | 3 ++
 .../lib/input/CompressedSplitLineReader.java    | 5 ++++
 .../hadoop/mapred/TestLineRecordReader.java     | 29 +++++++++++++++++++
 .../lib/input/TestLineRecordReader.java         | 29 +++++++++++++++++++
 .../compressedMultibyteDelimiter.txt.bz2        | Bin 0 -> 1096 bytes
 5 files changed, 66 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/hadoop/blob/7ded648a/hadoop-mapreduce-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt
index eee49a9..5aef3b2 100644
--- a/hadoop-mapreduce-project/CHANGES.txt
+++ b/hadoop-mapreduce-project/CHANGES.txt
@@ -84,6 +84,9 @@ Release 2.7.3 - UNRELEASED
     MAPREDUCE-6513. Job got hanged forever when one NM unstable for some time.
     Contributed by Varun Saxena & Wangda Tan
 
+    MAPREDUCE-6558. 
multibyte delimiters with compressed input files generate + duplicate records (Wilfred Spiegelenburg via jlowe) + Release 2.7.2 - 2016-01-25 INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/7ded648a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/CompressedSplitLineReader.java ---------------------------------------------------------------------- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/CompressedSplitLineReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/CompressedSplitLineReader.java index ef51f5c..9d0e949 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/CompressedSplitLineReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/CompressedSplitLineReader.java @@ -165,4 +165,9 @@ public class CompressedSplitLineReader extends SplitLineReader { public boolean needAdditionalRecordAfterSplit() { return !finished && needAdditionalRecord; } + + @Override + protected void unsetNeedAdditionalRecordAfterSplit() { + needAdditionalRecord = false; + } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/7ded648a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java ---------------------------------------------------------------------- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java index 986a2b2..471ea79 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapred/TestLineRecordReader.java @@ -647,4 +647,33 @@ public class TestLineRecordReader { assertFalse(reader.next(key, value)); assertEquals(12, reader.getPos()); } + + @Test + public void testBzipWithMultibyteDelimiter() throws IOException { + String testFileName = "compressedMultibyteDelimiter.txt.bz2"; + // firstSplitLength < (headers + blockMarker) will pass always since no + // records will be read (in the test file that is byte 0..9) + // firstSplitlength > (compressed file length - one compressed block + // size + 1) will also always pass since the second split will be empty + // (833 bytes is the last block start in the used data file) + int firstSplitLength = 100; + URL testFileUrl = getClass().getClassLoader().getResource(testFileName); + assertNotNull("Cannot find " + testFileName, testFileUrl); + File testFile = new File(testFileUrl.getFile()); + long testFileSize = testFile.length(); + Path testFilePath = new Path(testFile.getAbsolutePath()); + assertTrue("Split size is smaller than header length", + firstSplitLength > 9); + assertTrue("Split size is larger than compressed file size " + + testFilePath, testFileSize > firstSplitLength); + + Configuration conf = new Configuration(); + conf.setInt(org.apache.hadoop.mapreduce.lib.input. 
+ LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE); + + String delimiter = "<E-LINE>\r\r\n"; + conf.set("textinputformat.record.delimiter", delimiter); + testSplitRecordsForFile(conf, firstSplitLength, testFileSize, + testFilePath); + } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/7ded648a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java ---------------------------------------------------------------------- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java index 354b0b1..ccf8c93 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/java/org/apache/hadoop/mapreduce/lib/input/TestLineRecordReader.java @@ -610,4 +610,33 @@ public class TestLineRecordReader { // Key should be 12 right after "123456789\r\r\n" assertEquals(12, key.get()); } + + @Test + public void testBzipWithMultibyteDelimiter() throws IOException { + String testFileName = "compressedMultibyteDelimiter.txt.bz2"; + // firstSplitLength < (headers + blockMarker) will pass always since no + // records will be read (in the test file that is byte 0..9) + // firstSplitlength > (compressed file length - one compressed block + // size + 1) will also always pass since the second split will be empty + // (833 bytes is the last block start in the used data file) + int firstSplitLength = 100; + URL testFileUrl = getClass().getClassLoader().getResource(testFileName); + assertNotNull("Cannot find " + testFileName, 
testFileUrl); + File testFile = new File(testFileUrl.getFile()); + long testFileSize = testFile.length(); + Path testFilePath = new Path(testFile.getAbsolutePath()); + assertTrue("Split size is smaller than header length", + firstSplitLength > 9); + assertTrue("Split size is larger than compressed file size " + + testFilePath, testFileSize > firstSplitLength); + + Configuration conf = new Configuration(); + conf.setInt(org.apache.hadoop.mapreduce.lib.input. + LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE); + + String delimiter = "<E-LINE>\r\r\n"; + conf.set("textinputformat.record.delimiter", delimiter); + testSplitRecordsForFile(conf, firstSplitLength, testFileSize, + testFilePath); + } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/7ded648a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/compressedMultibyteDelimiter.txt.bz2 ---------------------------------------------------------------------- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/compressedMultibyteDelimiter.txt.bz2 b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/compressedMultibyteDelimiter.txt.bz2 new file mode 100644 index 0000000..f8e178f Binary files /dev/null and b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/test/resources/compressedMultibyteDelimiter.txt.bz2 differ --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-commits-h...@hadoop.apache.org