Author: dogacan
Date: Thu Jul 26 01:44:33 2007
New Revision: 559754

URL: http://svn.apache.org/viewvc?view=rev&rev=559754
Log:
NUTCH-525 - DeleteDuplicates generates ArrayIndexOutOfBoundsException when trying to rerun dedup on a segment. Contributed by Vishal Shah.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
    lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Jul 26 01:44:33 2007
@@ -99,6 +99,9 @@
 33. NUTCH-516 - Next fetch time is not set when it is a
     CrawlDatum.STATUS_FETCH_GONE. (Emmanuel Joke via dogacan)
 
+34. NUTCH-525 - DeleteDuplicates generates ArrayIndexOutOfBoundsException
+    when trying to rerun dedup on a segment. (Vishal Shah via dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Thu Jul 26 01:44:33 2007
@@ -182,7 +182,7 @@
           return false;
 
         // skip deleted documents
-        while (indexReader.isDeleted(doc) && doc < maxDoc) doc++;
+        while (doc < maxDoc && indexReader.isDeleted(doc)) doc++;
         if (doc >= maxDoc)
           return false;
 

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?view=diff&rev=559754&r1=559753&r2=559754
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Thu Jul 26 01:44:33 2007
@@ -42,6 +42,8 @@
   Path index1;
   Path index2;
   Path index3;
+  Path index4;
+  Path index5;
 
   public void setUp() throws Exception {
     conf = NutchConfiguration.create();
@@ -52,6 +54,8 @@
     index1 = createIndex("index1", true, 1.0f, 10L, false);
     index2 = createIndex("index2", false, 2.0f, 20L, true);
     index3 = createIndex("index3", true, 1.0f, 10L, true);
+    index4 = createSingleDocIndex("index4", 1.0f, 10L);
+    index5 = createSingleDocIndex("index5", 1.0f, 20L);
   }
 
   private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst) throws Exception {
@@ -80,6 +84,20 @@
     return idx;
   }
 
+  private Path createSingleDocIndex(String name, float inc, long time) throws Exception {
+    Path idx = new Path(root, name);
+    Path sub = new Path(idx, "part-0000");
+    Directory dir = FSDirectory.getDirectory(sub.toString());
+    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
+    Document doc = makeDoc(name,
+        MD5Hash.digest("1").toString(),
+        "http://www.example.com/1",
+        1.0f + inc, time + 1);
+    writer.addDocument(doc);
+    writer.close();
+    return idx;
+  }
+
   private Document makeDoc(String segment, String digest, String url, float boost, long time) {
     Document doc = new Document();
     doc.add(new Field("segment", segment, Field.Store.YES, Field.Index.NO));
@@ -171,6 +189,12 @@
       System.out.println(doc);
     }
     reader.close();
+  }
+
+  public void testRededuplicate() throws Exception {
+    DeleteDuplicates dedup = new DeleteDuplicates(conf);
+    dedup.dedup(new Path[]{index4, index5});
+    dedup.dedup(new Path[]{index4, index5});
   }
 
 }

-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems? Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs