Author: ab
Date: Tue Apr 27 15:23:09 2010
New Revision: 938511

URL: http://svn.apache.org/viewvc?rev=938511&view=rev
Log:
NUTCH-814 SegmentMerger bug (Rob Bradshaw, Dennis Kubes and ab).

Added:
    lucene/nutch/trunk/src/test/org/apache/nutch/segment/
    lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java   (with props)
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=938511&r1=938510&r2=938511&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Apr 27 15:23:09 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.1 - 2010-04-06
 
+* NUTCH-814 SegmentMerger bug (Rob Bradshaw, ab)
+
 * NUTCH-812 Crawl.java incorrectly uses the Generator API resulting in NPE (Phil Barnett via mattmann and ab)
 
 * NUTCH-810 Upgrade to Tika 0.7 (jnioche)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=938511&r1=938510&r2=938511&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Tue Apr 27 15:23:09 2010
@@ -147,7 +147,7 @@ public class SegmentMerger extends Confi
         throw new RuntimeException("Cannot identify segment:", e);
       }
       
-      final SequenceFile.Reader reader =
+      SequenceFile.Reader reader =
         new SequenceFile.Reader(FileSystem.get(job), fSplit.getPath(), job);
       
       final Writable w;
@@ -155,7 +155,15 @@ public class SegmentMerger extends Confi
         w = (Writable) reader.getValueClass().newInstance();
       } catch (Exception e) {
         throw new IOException(e.toString());
+      } finally {
+        try {
+          reader.close();
+        } catch (Exception e) {
+          // ignore
+        }
       }
+      final SequenceFileRecordReader<Text,Writable> splitReader =
+        new SequenceFileRecordReader<Text,Writable>(job, (FileSplit)split);
 
       try {
         return new SequenceFileRecordReader<Text, MetaWrapper>(job, fSplit) {
@@ -163,7 +171,7 @@ public class SegmentMerger extends Confi
           public synchronized boolean next(Text key, MetaWrapper wrapper) throws IOException {
             LOG.debug("Running OIF.next()");
 
-            boolean res = reader.next(key, w);
+            boolean res = splitReader.next(key, w);
             wrapper.set(w);
             wrapper.setMeta(SEGMENT_PART_KEY, spString);
             return res;
@@ -171,7 +179,7 @@ public class SegmentMerger extends Confi
           
           @Override
           public synchronized void close() throws IOException {
-            reader.close();
+            splitReader.close();
           }
           
           @Override

Added: lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java?rev=938511&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java Tue Apr 27 15:23:09 2010
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.segment;
+
+import java.text.DecimalFormat;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestSegmentMerger extends TestCase {
+  Configuration conf;
+  FileSystem fs;
+  Path testDir;
+  Path seg1;
+  Path seg2;
+  Path out;
+  int countSeg1, countSeg2;
+  
+  public void setUp() throws Exception {
+    conf = NutchConfiguration.create();
+    fs = FileSystem.get(conf);
+    long blkSize = fs.getDefaultBlockSize();
+    testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + System.currentTimeMillis());
+    seg1 = new Path(testDir, "seg1");
+    seg2 = new Path(testDir, "seg2");
+    out = new Path(testDir, "out");
+    // create large parse-text segments
+    System.err.println("Creating large segment 1...");
+    DecimalFormat df = new DecimalFormat("0000000");
+    Text k = new Text();
+    Path ptPath = new Path(new Path(seg1, ParseText.DIR_NAME), "part-00000");
+    MapFile.Writer w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class);
+    long curSize = 0;
+    countSeg1 = 0;
+    while (curSize < blkSize * 2) {
+      k.set("seg1-" + df.format(countSeg1));
+      w.append(k, new ParseText("seg1 text " + countSeg1));
+      countSeg1++;
+      curSize += 40; // roughly ...
+    }
+    w.close();
+    System.err.println(" - done: " + countSeg1 + " records.");
+    System.err.println("Creating large segment 2...");
+    ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000");
+    w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class);
+    curSize = 0;
+    countSeg2 = 0;
+    while (curSize < blkSize * 2) {
+      k.set("seg2-" + df.format(countSeg2));
+      w.append(k, new ParseText("seg2 text " + countSeg2));
+      countSeg2++;
+      curSize += 40; // roughly ...
+    }
+    w.close();
+    System.err.println(" - done: " + countSeg2 + " records.");
+  }
+  
+  public void tearDown() throws Exception {
+    fs.delete(testDir, true);
+  }
+  
+  public void testLargeMerge() throws Exception {
+    SegmentMerger merger = new SegmentMerger(conf);
+    merger.merge(out, new Path[]{seg1, seg2}, false, false, -1);
+    // verify output
+    FileStatus[] stats = fs.listStatus(out);
+    // there should be just one path
+    assertEquals(1, stats.length);
+    Path outSeg = stats[0].getPath();
+    Text k = new Text();
+    ParseText v = new ParseText();
+    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outSeg, ParseText.DIR_NAME), conf);
+    int cnt1 = 0, cnt2 = 0;
+    for (MapFile.Reader r : readers) {
+      while (r.next(k, v)) {
+        String ks = k.toString();
+        String vs = v.getText();
+        if (ks.startsWith("seg1-")) {
+          cnt1++;
+          assertTrue(vs.startsWith("seg1 "));
+        } else if (ks.startsWith("seg2-")) {
+          cnt2++;
+          assertTrue(vs.startsWith("seg2 "));
+        }
+      }
+      r.close();
+    }
+    assertEquals(countSeg1, cnt1);
+    assertEquals(countSeg2, cnt2);
+  }
+
+}

Propchange: lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to