Author: ab
Date: Tue Apr 27 15:23:09 2010
New Revision: 938511

URL: http://svn.apache.org/viewvc?rev=938511&view=rev
Log:
NUTCH-814 SegmentMerger bug (Rob Bradshaw, Dennis Kubes and ab).
Added: lucene/nutch/trunk/src/test/org/apache/nutch/segment/ lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=938511&r1=938510&r2=938511&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Apr 27 15:23:09 2010 @@ -2,6 +2,8 @@ Nutch Change Log Release 1.1 - 2010-04-06 +* NUTCH-814 SegmentMerger bug (Rob Bradshaw, ab) + * NUTCH-812 Crawl.java incorrectly uses the Generator API resulting in NPE (Phil Barnett via mattmann and ab) * NUTCH-810 Upgrade to Tika 0.7 (jnioche) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=938511&r1=938510&r2=938511&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Tue Apr 27 15:23:09 2010 @@ -147,7 +147,7 @@ public class SegmentMerger extends Confi throw new RuntimeException("Cannot identify segment:", e); } - final SequenceFile.Reader reader = + SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(job), fSplit.getPath(), job); final Writable w; @@ -155,7 +155,15 @@ public class SegmentMerger extends Confi w = (Writable) reader.getValueClass().newInstance(); } catch (Exception e) { throw new IOException(e.toString()); + } finally { + try { + reader.close(); + } catch (Exception e) { + // ignore + } } + final SequenceFileRecordReader<Text,Writable> splitReader = + new 
SequenceFileRecordReader<Text,Writable>(job, (FileSplit)split); try { return new SequenceFileRecordReader<Text, MetaWrapper>(job, fSplit) { @@ -163,7 +171,7 @@ public class SegmentMerger extends Confi public synchronized boolean next(Text key, MetaWrapper wrapper) throws IOException { LOG.debug("Running OIF.next()"); - boolean res = reader.next(key, w); + boolean res = splitReader.next(key, w); wrapper.set(w); wrapper.setMeta(SEGMENT_PART_KEY, spString); return res; @@ -171,7 +179,7 @@ public class SegmentMerger extends Confi @Override public synchronized void close() throws IOException { - reader.close(); + splitReader.close(); } @Override Added: lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java?rev=938511&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java Tue Apr 27 15:23:09 2010 @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.segment; + +import java.text.DecimalFormat; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestSegmentMerger extends TestCase { + Configuration conf; + FileSystem fs; + Path testDir; + Path seg1; + Path seg2; + Path out; + int countSeg1, countSeg2; + + public void setUp() throws Exception { + conf = NutchConfiguration.create(); + fs = FileSystem.get(conf); + long blkSize = fs.getDefaultBlockSize(); + testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + System.currentTimeMillis()); + seg1 = new Path(testDir, "seg1"); + seg2 = new Path(testDir, "seg2"); + out = new Path(testDir, "out"); + // create large parse-text segments + System.err.println("Creating large segment 1..."); + DecimalFormat df = new DecimalFormat("0000000"); + Text k = new Text(); + Path ptPath = new Path(new Path(seg1, ParseText.DIR_NAME), "part-00000"); + MapFile.Writer w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class); + long curSize = 0; + countSeg1 = 0; + while (curSize < blkSize * 2) { + k.set("seg1-" + df.format(countSeg1)); + w.append(k, new ParseText("seg1 text " + countSeg1)); + countSeg1++; + curSize += 40; // roughly ... 
+ } + w.close(); + System.err.println(" - done: " + countSeg1 + " records."); + System.err.println("Creating large segment 2..."); + ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000"); + w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class); + curSize = 0; + countSeg2 = 0; + while (curSize < blkSize * 2) { + k.set("seg2-" + df.format(countSeg2)); + w.append(k, new ParseText("seg2 text " + countSeg2)); + countSeg2++; + curSize += 40; // roughly ... + } + w.close(); + System.err.println(" - done: " + countSeg2 + " records."); + } + + public void tearDown() throws Exception { + fs.delete(testDir, true); + } + + public void testLargeMerge() throws Exception { + SegmentMerger merger = new SegmentMerger(conf); + merger.merge(out, new Path[]{seg1, seg2}, false, false, -1); + // verify output + FileStatus[] stats = fs.listStatus(out); + // there should be just one path + assertEquals(1, stats.length); + Path outSeg = stats[0].getPath(); + Text k = new Text(); + ParseText v = new ParseText(); + MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outSeg, ParseText.DIR_NAME), conf); + int cnt1 = 0, cnt2 = 0; + for (MapFile.Reader r : readers) { + while (r.next(k, v)) { + String ks = k.toString(); + String vs = v.getText(); + if (ks.startsWith("seg1-")) { + cnt1++; + assertTrue(vs.startsWith("seg1 ")); + } else if (ks.startsWith("seg2-")) { + cnt2++; + assertTrue(vs.startsWith("seg2 ")); + } + } + r.close(); + } + assertEquals(countSeg1, cnt1); + assertEquals(countSeg2, cnt2); + } + +} Propchange: lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java ------------------------------------------------------------------------------ svn:eol-style = native