Author: dogacan
Date: Thu Jul 26 01:44:33 2007
New Revision: 559754

URL: http://svn.apache.org/viewvc?view=rev&rev=559754
Log:
NUTCH-525 - DeleteDuplicates generates ArrayIndexOutOfBoundsException when 
trying to rerun dedup on a segment. Contributed by Vishal Shah.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
    
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Jul 26 01:44:33 2007
@@ -99,6 +99,9 @@
 33. NUTCH-516 - Next fetch time is not set when it is a 
     CrawlDatum.STATUS_FETCH_GONE. (Emmanuel Joke via dogacan)
 
+34. NUTCH-525 - DeleteDuplicates generates ArrayIndexOutOfBoundsException 
+    when trying to rerun dedup on a segment. (Vishal Shah via dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
Thu Jul 26 01:44:33 2007
@@ -182,7 +182,7 @@
           return false;
 
         // skip deleted documents
-        while (indexReader.isDeleted(doc) && doc < maxDoc) doc++;
+        while (doc < maxDoc && indexReader.isDeleted(doc)) doc++;
         if (doc >= maxDoc)
           return false;
 

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- 
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java 
(original)
+++ 
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java 
Thu Jul 26 01:44:33 2007
@@ -42,6 +42,8 @@
   Path index1;
   Path index2;
   Path index3;
+  Path index4;
+  Path index5;
   
   public void setUp() throws Exception {
     conf = NutchConfiguration.create();
@@ -52,6 +54,8 @@
     index1 = createIndex("index1", true, 1.0f, 10L, false);
     index2 = createIndex("index2", false, 2.0f, 20L, true);
     index3 = createIndex("index3", true, 1.0f, 10L, true);
+    index4 = createSingleDocIndex("index4", 1.0f, 10L);
+    index5 = createSingleDocIndex("index5", 1.0f, 20L);
   }
   
   private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst) throws Exception {
@@ -80,6 +84,20 @@
     return idx;
   }
   
+  private Path createSingleDocIndex(String name, float inc, long time) throws Exception {
+    Path idx = new Path(root, name);
+    Path sub = new Path(idx, "part-0000");
+    Directory dir = FSDirectory.getDirectory(sub.toString());
+    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
+    Document doc = makeDoc(name,
+        MD5Hash.digest("1").toString(),
+        "http://www.example.com/1",
+       1.0f + inc, time + 1);
+    writer.addDocument(doc);
+    writer.close();
+    return idx;
+  }
+  
   private Document makeDoc(String segment, String digest, String url, float boost, long time) {
     Document doc = new Document();
     doc.add(new Field("segment", segment, Field.Store.YES, Field.Index.NO));
@@ -171,6 +189,12 @@
       System.out.println(doc);
     }
     reader.close();
+  }
+  
+  public void testRededuplicate() throws Exception {
+    DeleteDuplicates dedup = new DeleteDuplicates(conf);
+    dedup.dedup(new Path[]{index4, index5});
+    dedup.dedup(new Path[]{index4, index5});
   }
   
 }



-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >>  http://get.splunk.com/
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to