svn commit: r938511 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/segment/SegmentMerger.java src/test/org/apache/nutch/segment/ src/test/org/apache/nutch/segment/TestSegmentMerger.ja
Author: ab Date: Tue Apr 27 15:23:09 2010 New Revision: 938511 URL: http://svn.apache.org/viewvc?rev=938511view=rev Log: NUTCH-814 SegmentMerger bug (Rob Bradshaw, Dennis Kubes and ab). Added: lucene/nutch/trunk/src/test/org/apache/nutch/segment/ lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=938511r1=938510r2=938511view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Apr 27 15:23:09 2010 @@ -2,6 +2,8 @@ Nutch Change Log Release 1.1 - 2010-04-06 +* NUTCH-814 SegmentMerger bug (Rob Bradshaw, ab) + * NUTCH-812 Crawl.java incorrectly uses the Generator API resulting in NPE (Phil Barnett via mattmann and ab) * NUTCH-810 Upgrade to Tika 0.7 (jnioche) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=938511r1=938510r2=938511view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Tue Apr 27 15:23:09 2010 @@ -147,7 +147,7 @@ public class SegmentMerger extends Confi throw new RuntimeException(Cannot identify segment:, e); } - final SequenceFile.Reader reader = + SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(job), fSplit.getPath(), job); final Writable w; @@ -155,7 +155,15 @@ public class SegmentMerger extends Confi w = (Writable) reader.getValueClass().newInstance(); } catch (Exception e) { throw new IOException(e.toString()); + } finally { +try { + reader.close(); +} catch (Exception e) { + // ignore +} } + final SequenceFileRecordReaderText,Writable splitReader = +new SequenceFileRecordReaderText,Writable(job, (FileSplit)split); try { return new SequenceFileRecordReaderText, MetaWrapper(job, fSplit) { @@ -163,7 +171,7 @@ public class SegmentMerger extends Confi public synchronized boolean next(Text key, MetaWrapper wrapper) throws IOException { LOG.debug(Running OIF.next()); -boolean res = reader.next(key, w); +boolean res = splitReader.next(key, w); wrapper.set(w); wrapper.setMeta(SEGMENT_PART_KEY, spString); return res; @@ -171,7 +179,7 @@ public class SegmentMerger extends Confi @Override public synchronized void close() throws IOException { -reader.close(); +splitReader.close(); } @Override Added: lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java?rev=938511view=auto == --- lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java Tue Apr 27 15:23:09 2010 @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.segment; + +import java.text.DecimalFormat; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.MapFileOutputFormat; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class
svn commit: r938586 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
Author: ab Date: Tue Apr 27 18:06:10 2010 New Revision: 938586 URL: http://svn.apache.org/viewvc?rev=938586view=rev Log: NUTCH-815 Invalid blank line before If-Modified-Since header. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=938586r1=938585r2=938586view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Apr 27 18:06:10 2010 @@ -2,6 +2,8 @@ Nutch Change Log Release 1.1 - 2010-04-06 +* NUTCH-815 Invalid blank line before If-Modified-Since header (Pascal Dimassimo via ab) + * NUTCH-814 SegmentMerger bug (Rob Bradshaw, ab) * NUTCH-812 Crawl.java incorrectly uses the Generator API resulting in NPE (Phil Barnett via mattmann and ab) Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=938586r1=938585r2=938586view=diff == --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Tue Apr 27 18:06:10 2010 @@ -128,11 +128,11 @@ public class HttpResponse implements Res reqStr.append(this.http.getAcceptLanguage()); reqStr.append(\r\n); - reqStr.append(\r\n); if (datum.getModifiedTime() 0) { reqStr.append(If-Modified-Since: + HttpDateFormat.toString(datum.getModifiedTime())); reqStr.append(\r\n); } + reqStr.append(\r\n); byte[] reqBytes= reqStr.toString().getBytes();
svn commit: r925179 [2/2] - in /lucene/nutch/trunk: ./ lib/ src/java/org/apache/nutch/analysis/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/indexer/field/ src/java/org/apache/nutch/in
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java?rev=925179r1=925178r2=925179view=diff == --- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java (original) +++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java Fri Mar 19 11:34:33 2010 @@ -20,6 +20,7 @@ package org.creativecommons.nutch; import org.apache.nutch.indexer.Indexer; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.store.FSDirectory; import org.apache.lucene.document.Document; import org.apache.commons.logging.Log; @@ -83,7 +84,7 @@ public class CCDeleteUnlicensedTool { File indexDone = new File(directories[i], Indexer.DONE_NAME); if (indexDone.exists() indexDone.isFile()){ File indexDir = new File(directories[i], index); - IndexReader reader = IndexReader.open(indexDir); + IndexReader reader = IndexReader.open(FSDirectory.open(indexDir)); maxDoc += reader.maxDoc(); vReaders.add(reader); } Modified: lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java?rev=925179r1=925178r2=925179view=diff == --- lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java (original) +++ lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java Fri Mar 19 11:34:33 2010 @@ -80,8 +80,10 @@ public class BasicFieldFilter // create lucene fields from the FieldWritable objects Field.Store store = field.isStored() ? Field.Store.YES : Field.Store.NO; - Field.Index indexed = field.isIndexed() ? field.isTokenized() -? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED : Field.Index.NO; + Field.Index indexed = + field.isIndexed() + ? field.isTokenized() ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED + : Field.Index.NO; Field docField = new Field(fieldName, field.getValue(), store, indexed); Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-3.0.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-3.0.1.jar?rev=925179view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-3.0.1.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml?rev=925179r1=925178r2=925179view=diff == --- lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml Fri Mar 19 11:34:33 2010 @@ -25,11 +25,11 @@ plugin id=lib-lucene-analyzers name=Lucene Analysers - version=2.9.1 + version=3.0.1 provider-name=org.apache.lucene runtime - library name=lucene-analyzers-2.9.1.jar + library name=lucene-analyzers-3.0.1.jar export name=*/ /library /runtime Modified: lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java?rev=925179r1=925178r2=925179view=diff == --- lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java (original) +++ lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java Fri Mar 19 11:34:33 2010 @@ -29,8 +29,7 @@ import org.apache.commons.logging.LogFac import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.RangeQuery; -import org.apache.lucene.index.Term; +import org.apache.lucene.search.TermRangeQuery; import java.util.regex.Pattern; import java.util.regex.Matcher;
svn commit: r925186 - in /lucene/nutch/trunk: ./ lib/ lib/native/Linux-amd64-64/ lib/native/Linux-i386-32/
Author: ab Date: Fri Mar 19 11:52:26 2010 New Revision: 925186 URL: http://svn.apache.org/viewvc?rev=925186view=rev Log: NUTCH-803 Upgrade to Hadoop 0.20.2. Added: lucene/nutch/trunk/lib/hadoop-0.20.2-core.jar (with props) lucene/nutch/trunk/lib/hadoop-0.20.2-tools.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.20.1-core.jar lucene/nutch/trunk/lib/hadoop-0.20.1-tools.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.a lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1 lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1.0.0 lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1 lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0 Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=925186r1=925185r2=925186view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Mar 19 11:52:26 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-803 Upgrade to Hadoop 0.20.2 (ab) + * NUTCH-787 Upgrade Lucene to 3.0.1. (Dawid Weiss via ab) * NUTCH-796 Zero results problems difficult to troubleshoot due to lack of logging (ab) Added: lucene/nutch/trunk/lib/hadoop-0.20.2-core.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.20.2-core.jar?rev=925186view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.20.2-core.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/hadoop-0.20.2-tools.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.20.2-tools.jar?rev=925186view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.20.2-tools.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.a URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.a?rev=925186r1=925185r2=925186view=diff == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so?rev=925186r1=925185r2=925186view=diff == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1 URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1?rev=925186r1=925185r2=925186view=diff == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1.0.0 URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1.0.0?rev=925186r1=925185r2=925186view=diff == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a?rev=925186r1=925185r2=925186view=diff == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so?rev=925186r1=925185r2=925186view=diff == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1 URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1?rev=925186r1=925185r2=925186view=diff == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0 URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0?rev=925186r1=925185r2=925186view=diff == Binary files - no diff available.
svn commit: r924945 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/searcher/DistributedSearchBean.java src/java/org/apache/nutch/searcher/LuceneSearchBean.java
Author: ab Date: Thu Mar 18 18:44:45 2010 New Revision: 924945 URL: http://svn.apache.org/viewvc?rev=924945view=rev Log: NUTCH-796 Zero results problems difficult to troubleshoot due to lack of logging. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneSearchBean.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=924945r1=924944r2=924945view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Mar 18 18:44:45 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-796 Zero results problems difficult to troubleshoot due to lack of logging (ab) + * NUTCH-801 Remove RTF and MP3 parse plugins (jnioche) * NUTCH-798 Upgrade to SOLR1.4 and its dependencies (jnioche) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java?rev=924945r1=924944r2=924945view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearchBean.java Thu Mar 18 18:44:45 2010 @@ -140,12 +140,17 @@ public class DistributedSearchBean imple ListSearchBean beanList = new ArrayListSearchBean(); if (fs.exists(luceneConfig)) { + LOG.info(Adding Nutch searchers in + + luceneConfig.makeQualified(fs).toUri()); addLuceneBeans(beanList, luceneConfig, conf); } if (fs.exists(solrConfig)) { + LOG.info(Adding Solr searchers in + + solrConfig.makeQualified(fs).toUri()); addSolrBeans(beanList, solrConfig, conf); } +LOG.info(Added + beanList.size() + remote searchers.); beans = beanList.toArray(new SearchBean[beanList.size()]); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneSearchBean.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneSearchBean.java?rev=924945r1=924944r2=924945view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneSearchBean.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneSearchBean.java Thu Mar 18 18:44:45 2010 @@ -53,12 +53,19 @@ public class LuceneSearchBean implements private void init(Path indexDir, Path indexesDir) throws IOException { +Path absIndexDir = indexDir.makeQualified(indexDir.getFileSystem(conf)); +Path absIndexesDir = indexesDir.makeQualified(indexesDir.getFileSystem(conf)); if (this.fs.exists(indexDir)) { - LOG.info(opening merged index in + indexDir); + LOG.info(opening merged index in + absIndexDir.toUri()); this.searcher = new IndexSearcher(indexDir, this.conf); } else { - LOG.info(opening indexes in + indexesDir); - + if (!this.fs.exists(indexesDir)) { +// should throw exception ? +LOG.warn(Neither + absIndexDir.toUri() + nor + +absIndexesDir.toUri() + found!); + } else { +LOG.info(opening indexes in + absIndexesDir.toUri()); + } ListPath vDirs = new ArrayListPath(); FileStatus[] fstats = fs.listStatus(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); Path[] directories = HadoopFSUtil.getPaths(fstats);
svn commit: r887151 - /lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
Author: ab Date: Fri Dec 4 10:27:03 2009 New Revision: 887151 URL: http://svn.apache.org/viewvc?rev=887151view=rev Log: NUTCH-767 Fix a failing test - still needs more work. Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=887151r1=887150r2=887151view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Fri Dec 4 10:27:03 2009 @@ -63,19 +63,28 @@ http://www.foo.com/;, .getBytes(UTF8), text/html; charset=UTF-8, p, conf); -assertEquals(text/html, c.getContentType()); +// TODO check potential Tika issue and +// revert the expected value to text/html +// see https://issues.apache.org/jira/browse/NUTCH-767 +assertEquals(text/plain, c.getContentType()); c = new Content(http://www.foo.com/foo.html;, http://www.foo.com/;, .getBytes(UTF8), , p, conf); -assertEquals(text/html, c.getContentType()); +// TODO check potential Tika issue and +// revert the expected value to text/html +// see https://issues.apache.org/jira/browse/NUTCH-767 +assertEquals(text/plain, c.getContentType()); c = new Content(http://www.foo.com/foo.html;, http://www.foo.com/;, .getBytes(UTF8), null, p, conf); -assertEquals(text/html, c.getContentType()); +// TODO check potential Tika issue and +// revert the expected value to text/html +// see https://issues.apache.org/jira/browse/NUTCH-767 +assertEquals(text/plain, c.getContentType()); c = new Content(http://www.foo.com/;, http://www.foo.com/;, @@ -99,7 +108,10 @@ http://www.foo.com/;, .getBytes(UTF8), , p, conf); -assertEquals(MimeTypes.DEFAULT, c.getContentType()); +// TODO check that Tika returns the right value and +// revert to the default type +// see https://issues.apache.org/jira/browse/NUTCH-767 +assertEquals(text/plain, c.getContentType()); c = new Content(http://www.foo.com/;, http://www.foo.com/;,
svn commit: r885776 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/fetcher/Fetcher.java
Author: ab Date: Tue Dec 1 14:50:15 2009 New Revision: 885776 URL: http://svn.apache.org/viewvc?rev=885776view=rev Log: NUTCH-770 Timebomb for Fetcher. Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=885776r1=885775r2=885776view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Tue Dec 1 14:50:15 2009 @@ -601,6 +601,15 @@ descriptionIf true, fetcher will store content./description /property +property + namefetcher.timelimit.mins/name + value-1/value + descriptionThis is the number of minutes allocated to the fetching. + Once this value is reached, any remaining entry from the input URL list is skipped + and all active queues are emptied. The default value of -1 deactivates the time limit. + /description +/property + !-- indexer properties -- property @@ -1277,4 +1286,14 @@ /description /property +!-- solr index properties -- +property + namesolrindex.mapping.file/name + valuesolrindex-mapping.xml/value + description + Defines the name of the file that will be used in the mapping of internal + nutch field names to solr index fields as specified in the target Solr schema. + /description +/property + /configuration Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=885776r1=885775r2=885776view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Dec 1 14:50:15 2009 @@ -222,6 +222,12 @@ setEndTime(System.currentTimeMillis() - crawlDelay); } +public synchronized int emptyQueue() { + int presize = queue.size(); + queue.clear(); + return presize; +} + public int getQueueSize() { return queue.size(); } @@ -299,6 +305,7 @@ boolean byIP; long crawlDelay; long minCrawlDelay; +long timelimit = -1; Configuration conf; public FetchItemQueues(Configuration conf) { @@ -308,6 +315,7 @@ this.byIP = conf.getBoolean(fetcher.threads.per.host.by.ip, false); this.crawlDelay = (long) (conf.getFloat(fetcher.server.delay, 1.0f) * 1000); this.minCrawlDelay = (long) (conf.getFloat(fetcher.server.min.delay, 0.0f) * 1000); + this.timelimit = conf.getLong(fetcher.timelimit.mins, -1); } public int getTotalSize() { @@ -371,6 +379,29 @@ return null; } +// called only once the feeder has stopped +public synchronized int checkTimelimit() { + int count = 0; + if (System.currentTimeMillis() = timelimit timelimit != -1) { +// emptying the queues +for (String id : queues.keySet()) { + FetchItemQueue fiq = queues.get(id); + if (fiq.getQueueSize() == 0) continue; + LOG.info(* queue: + id + timelimit! ); + int deleted = fiq.emptyQueue(); + for (int i = 0; i deleted; i++) { +totalSize.decrementAndGet(); + } + count += deleted; +} +// there might also be a case where totalsize !=0 but number of queues +// == 0 +// in which case we simply force it to 0 to avoid blocking +if (totalSize.get() != 0 queues.size() == 0) totalSize.set(0); + } + return count; +} + public synchronized void dump() { for (String id : queues.keySet()) { FetchItemQueue fiq = queues.get(id); @@ -389,6 +420,7 @@ private RecordReaderText, CrawlDatum reader; private FetchItemQueues queues; private int size; +private long timelimit = -1; public QueueFeeder(RecordReaderText, CrawlDatum reader, FetchItemQueues queues, int size) { @@ -399,11 +431,29 @@ this.setName(QueueFeeder); } +public void setTimeLimit(long tl) { + timelimit = tl; +} + public void run() { boolean hasMore = true; int cnt = 0; - + int timelimitcount = 0; while (hasMore) { +if (System.currentTimeMillis() = timelimit timelimit != -1) { + // enough .. lets' simply + // read all the entries from the input without processing them + try { +Text url = new Text(); +CrawlDatum datum = new CrawlDatum(); +hasMore = reader.next(url, datum); +timelimitcount++; + } catch (IOException e) { +LOG.fatal(QueueFeeder error reading input, record + cnt, e); +return
svn commit: r885785 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/fetcher/Fetcher.java
Author: ab Date: Tue Dec 1 15:15:00 2009 New Revision: 885785 URL: http://svn.apache.org/viewvc?rev=885785view=rev Log: NUTCH-769 Fetcher to skip queues for URLS getting repeated exceptions. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=885785r1=885784r2=885785view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Dec 1 15:15:00 2009 @@ -2,6 +2,9 @@ Unreleased Changes +* NUTCH-769 Fetcher to skip queues for URLS getting repeated exceptions + (Julien Nioche via ab) + * NUTCH-768 - Upgrade Nutch 1.0 to use Hadoop 0.20.1, also upgrades Xerces to version 2.9.1. (kubes) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=885785r1=885784r2=885785view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Tue Dec 1 15:15:00 2009 @@ -610,6 +610,16 @@ /description /property +property + namefetcher.max.exceptions.per.queue/name + value-1/value + descriptionThe maximum number of protocol-level exceptions (e.g. timeouts) per + host (or IP) queue. Once this value is reached, any remaining entries from this + queue are purged, effectively stopping the fetching from this host/IP. The default + value of -1 deactivates this limit. + /description +/property + !-- indexer properties -- property Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=885785r1=885784r2=885785view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Dec 1 15:15:00 2009 @@ -208,6 +208,7 @@ ListFetchItem queue = Collections.synchronizedList(new LinkedListFetchItem()); SetFetchItem inProgress = Collections.synchronizedSet(new HashSetFetchItem()); AtomicLong nextFetchTime = new AtomicLong(); +AtomicInteger exceptionCounter = new AtomicInteger(); long crawlDelay; long minCrawlDelay; int maxThreads; @@ -236,6 +237,10 @@ return inProgress.size(); } +public int incrementExceptionCounter() { + return exceptionCounter.incrementAndGet(); +} + public void finishFetchItem(FetchItem it, boolean asap) { if (it != null) { inProgress.remove(it); @@ -306,6 +311,7 @@ long crawlDelay; long minCrawlDelay; long timelimit = -1; +int maxExceptionsPerQueue = -1; Configuration conf; public FetchItemQueues(Configuration conf) { @@ -316,6 +322,7 @@ this.crawlDelay = (long) (conf.getFloat(fetcher.server.delay, 1.0f) * 1000); this.minCrawlDelay = (long) (conf.getFloat(fetcher.server.min.delay, 0.0f) * 1000); this.timelimit = conf.getLong(fetcher.timelimit.mins, -1); + this.maxExceptionsPerQueue = conf.getInt(fetcher.max.exceptions.per.queue, -1); } public int getTotalSize() { @@ -402,6 +409,36 @@ return count; } +/** + * Increment the exception counter of a queue in case of an exception e.g. + * timeout; when higher than a given threshold simply empty the queue. + * + * @param queueid + * @return number of purged items + */ +public synchronized int checkExceptionThreshold(String queueid) { + FetchItemQueue fiq = queues.get(queueid); + if (fiq == null) { +return 0; + } + if (fiq.getQueueSize() == 0) { +return 0; + } + int excCount = fiq.incrementExceptionCounter(); + if (maxExceptionsPerQueue!= -1 excCount = maxExceptionsPerQueue) { +// too many exceptions for items in this queue - purge it +int deleted = fiq.emptyQueue(); +LOG.info(* queue: + queueid + removed + deleted ++ URLs from queue because + excCount + exceptions occurred); +for (int i = 0; i deleted; i++) { + totalSize.decrementAndGet(); +} +return deleted; + } + return 0; +} + + public synchronized void dump() { for (String id : queues.keySet()) { FetchItemQueue fiq = queues.get(id); @@ -673,6 +710,8 @@ case ProtocolStatus.EXCEPTION: logError(fit.url, status.getMessage()); +int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID
svn commit: r885148 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/searcher/NutchBean.java
Author: ab Date: Sat Nov 28 21:16:42 2009 New Revision: 885148 URL: http://svn.apache.org/viewvc?rev=885148view=rev Log: NUTCH-746 NutchBeanConstructor does not close NutchBean upon contextDestroyed, causing resource leak in the container. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=885148r1=885147r2=885148view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Nov 28 21:16:42 2009 @@ -2,6 +2,11 @@ Unreleased Changes +* NUTCH-746 NutchBeanConstructor does not close NutchBean upon contextDestroyed, + causing resource leak in the container. (Kirby Bohling via ab) + +* NUTCH-772 Upgrade Nutch to use Lucene 2.9.1 (ab) + * NUTCH-760 Allow field mapping from Nutch to Solr index (David Stuart, ab) * NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer (Julien Nioche, ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=885148r1=885147r2=885148view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Sat Nov 28 21:16:42 2009 @@ -413,7 +413,24 @@ */ public static class NutchBeanConstructor implements ServletContextListener { -public void contextDestroyed(ServletContextEvent sce) { } +public void contextDestroyed(ServletContextEvent sce) { + final ServletContext context = sce.getServletContext(); + + LOG.info(Closing Bean); + try { +Object tmp = context.getAttribute(NutchBean.KEY); + +if (tmp instanceof NutchBean) { + NutchBean bean = (NutchBean) tmp; + bean.close(); +} else { + LOG.warn(No bean configured, or the wrong type? Potential PermGen leak, or startup problem.); +} + } + catch (final IOException ex) { +LOG.error(StringUtils.stringifyException(ex)); + } +} public void contextInitialized(ServletContextEvent sce) { final ServletContext app = sce.getServletContext();
svn commit: r885150 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/searcher/FetchedSegments.java
Author: ab Date: Sat Nov 28 21:26:51 2009 New Revision: 885150 URL: http://svn.apache.org/viewvc?rev=885150view=rev Log: NUTCH-738 Close SegmentUpdater when FetchedSegments is closed. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=885150r1=885149r2=885150view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Nov 28 21:26:51 2009 @@ -2,6 +2,9 @@ Unreleased Changes +* NUTCH-738 Close SegmentUpdater when FetchedSegments is closed + (Martina Kich, Kirby Bohling via ab) + * NUTCH-746 NutchBeanConstructor does not close NutchBean upon contextDestroyed, causing resource leak in the container. (Kirby Bohling via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=885150r1=885149r2=885150view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java Sat Nov 28 21:26:51 2009 @@ -66,9 +66,19 @@ private class SegmentUpdater extends Thread { +private volatile boolean stopRequested = false; + +@Override +public void interrupt() { + super.interrupt(); + stopRequested = true; +} + + @Override public void run() { - while (true) { + + while (!stopRequested !Thread.currentThread().isInterrupted()) { try { final FileStatus[] fstats = fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); @@ -194,7 +204,9 @@ private final FileSystem fs; private final Configuration conf; private final Path segmentsDir; - private final SegmentUpdater segUpdater; + + // This must be nullable upon close, so do not declare final. + private SegmentUpdater segUpdater; private final Summarizer summarizer; /** Construct given a directory containing fetcher output. */ @@ -303,6 +315,13 @@ } public void close() throws IOException { +// Interrupt that thread to convince it to stop running. +segUpdater.interrupt(); + +// Break reference cycle, otherwise this points to segUpdater, and +// segUpdater.$0 points to this. It appeared to keep the thread from +// being GC'ed/reaped. +segUpdater = null; final IteratorSegment iterator = segments.values().iterator(); while (iterator.hasNext()) { iterator.next().close();
svn commit: r885152 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
Author: ab Date: Sat Nov 28 21:35:11 2009 New Revision: 885152 URL: http://svn.apache.org/viewvc?rev=885152view=rev Log: NUTCH-739 SolrDeleteDuplications too slow when using hadoop. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=885152r1=885151r2=885152view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Nov 28 21:35:11 2009 @@ -2,8 +2,10 @@ Unreleased Changes +* NUTCH-739 SolrDeleteDuplications too slow when using hadoop (Dmitry Lihachev via ab) + * NUTCH-738 Close SegmentUpdater when FetchedSegments is closed - (Martina Kich, Kirby Bohling via ab) + (Martina Koch, Kirby Bohling via ab) * NUTCH-746 NutchBeanConstructor does not close NutchBean upon contextDestroyed, causing resource leak in the container. (Kirby Bohling via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=885152r1=885151r2=885152view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Sat Nov 28 21:35:11 2009 @@ -298,7 +298,6 @@ if (numDeletes 0) { updateRequest.process(solr); } - solr.optimize(); } catch (SolrServerException e) { throw new IOException(e); }
svn commit: r885156 - in /lucene/nutch/trunk: CHANGES.txt build.xml
Author: ab Date: Sat Nov 28 22:29:22 2009 New Revision: 885156 URL: http://svn.apache.org/viewvc?rev=885156view=rev Log: NUTCH-741 Job file includes multiple copies of nutch config files. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=885156r1=885155r2=885156view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Nov 28 22:29:22 2009 @@ -2,6 +2,9 @@ Unreleased Changes +* NUTCH-741 Job file includes multiple copies of nutch config files + (Kirby Bohling via ab) + * NUTCH-739 SolrDeleteDuplications too slow when using hadoop (Dmitry Lihachev via ab) * NUTCH-738 Close SegmentUpdater when FetchedSegments is closed Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=885156r1=885155r2=885156view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Sat Nov 28 22:29:22 2009 @@ -149,7 +149,12 @@ !-- == -- target name=job depends=compile jar jarfile=${build.dir}/${final.name}.job - zipfileset dir=${build.classes}/ + !-- If the build.classes has the nutch config files because the jar + command command has run, exclude them. The conf directory has + them. + -- + zipfileset dir=${build.classes} + excludes=nutch-default.xml,nutch-site.xml/ zipfileset dir=${conf.dir} excludes=*.template,hadoop*.*/ zipfileset dir=${lib.dir} prefix=lib includes=**/*.jar excludes=hadoop-*.jar/
svn commit: r884587 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Author: ab Date: Thu Nov 26 15:35:56 2009 New Revision: 884587 URL: http://svn.apache.org/viewvc?rev=884587view=rev Log: Fix a bug resulting from over-eager optimization in NUTCH-761. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=884587r1=884586r2=884587view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Thu Nov 26 15:35:56 2009 @@ -42,8 +42,6 @@ private boolean additionsAllowed; private int maxInterval; private FetchSchedule schedule; - private CrawlDatum fetch = new CrawlDatum(); - private CrawlDatum old = new CrawlDatum(); public void configure(JobConf job) { retryMax = job.getInt(db.fetch.retry.max, 3); @@ -61,6 +59,9 @@ OutputCollectorText, CrawlDatum output, Reporter reporter) throws IOException { +CrawlDatum fetch = new CrawlDatum(); +CrawlDatum old = new CrawlDatum(); + boolean fetchSet = false; boolean oldSet = false; byte[] signature = null;
svn commit: r884075 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
Author: ab Date: Wed Nov 25 12:44:34 2009 New Revision: 884075 URL: http://svn.apache.org/viewvc?rev=884075view=rev Log: Change access from private to public - this fixes Crawl.java breakage. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=884075r1=884074r2=884075view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Wed Nov 25 12:44:34 2009 @@ -50,7 +50,7 @@ super(conf); } - private void indexSolr(String solrUrl, Path crawlDb, Path linkDb, + public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, ListPath segments) throws IOException { LOG.info(SolrIndexer: starting);
svn commit: r884198 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
Author: ab Date: Wed Nov 25 17:10:25 2009 New Revision: 884198 URL: http://svn.apache.org/viewvc?rev=884198view=rev Log: NUTCH-773 Some minor bugs in AbstractFetchSchedule. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=884198r1=884197r2=884198view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Nov 25 17:10:25 2009 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-773 - Some minor bugs in AbstractFetchSchedule (Reinhard Schwab) + * NUTCH-765 - Allow Crawl class to call Either Solr or Lucene Indexer (kubes) * NUTCH-735 - crawl-tool.xml must be read before nutch-site.xml when Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=884198r1=884197r2=884198view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Wed Nov 25 17:10:25 2009 @@ -125,7 +125,7 @@ */ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime) { -datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY); +datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY*1000); datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1); return datum; } @@ -159,7 +159,9 @@ // pages with too long fetchInterval are adjusted so that they fit within // maximum fetchInterval (segment retention period). if (datum.getFetchTime() - curTime (long) maxInterval * 1000) { - datum.setFetchInterval(maxInterval * 0.9f); + if (datum.getFetchInterval() maxInterval) { +datum.setFetchInterval(maxInterval * 0.9f); + } datum.setFetchTime(curTime); } if (datum.getFetchTime() curTime) {
svn commit: r884203 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Author: ab Date: Wed Nov 25 17:20:33 2009 New Revision: 884203 URL: http://svn.apache.org/viewvc?rev=884203view=rev Log: NUTCH-753 Prevent new Fetcher from retrieving the robots twice. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=884203r1=884202r2=884203view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Nov 25 17:20:33 2009 @@ -2,7 +2,9 @@ Unreleased Changes -* NUTCH-773 - Some minor bugs in AbstractFetchSchedule (Reinhard Schwab) +* NUTCH-753 Prevent new Fetcher from retrieving the robots twice (Julien Nioche via ab) + +* NUTCH-773 - Some minor bugs in AbstractFetchSchedule (Reinhard Schwab via ab) * NUTCH-765 - Allow Crawl class to call Either Solr or Lucene Indexer (kubes) Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=884203r1=884202r2=884203view=diff == --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Wed Nov 25 17:20:33 2009 @@ -185,6 +185,7 @@ String urlString = url.toString(); try { URL u = new URL(urlString); + long delay = serverDelay; if (checkRobots) { try { @@ -197,10 +198,10 @@ logger.trace(Exception checking robot rules for + url + : + e); } } + +long crawlDelay = robots.getCrawlDelay(this, u); +delay = crawlDelay 0 ? crawlDelay : serverDelay; } - - long crawlDelay = robots.getCrawlDelay(this, u); - long delay = crawlDelay 0 ? crawlDelay : serverDelay; if (checkBlocking maxCrawlDelay = 0 delay maxCrawlDelay) { // skip this page, otherwise the thread would block for too long. LOGGER.info(Skipping: + u + exceeds fetcher.max.crawl.delay, max=
svn commit: r884224 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Author: ab Date: Wed Nov 25 18:08:24 2009 New Revision: 884224 URL: http://svn.apache.org/viewvc?rev=884224view=rev Log: NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=884224r1=884223r2=884224view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Nov 25 18:08:24 2009 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer (Julien Nioche, ab) + * NUTCH-753 Prevent new Fetcher from retrieving the robots twice (Julien Nioche via ab) * NUTCH-773 - Some minor bugs in AbstractFetchSchedule (Reinhard Schwab via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=884224r1=884223r2=884224view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Wed Nov 25 18:08:24 2009 @@ -64,13 +64,20 @@ boolean fetchSet = false; boolean oldSet = false; byte[] signature = null; +boolean multiple = false; // avoid deep copy when only single value exists linked.clear(); while (values.hasNext()) { CrawlDatum datum = (CrawlDatum)values.next(); + if (!multiple values.hasNext()) multiple = true; if (CrawlDatum.hasDbStatus(datum)) { if (!oldSet) { - old.set(datum); + if (multiple) { +old.set(datum); + } else { +// no need for a deep copy - this is the only value +old = datum; + } oldSet = true; } else { // always take the latest version @@ -81,7 +88,11 @@ if (CrawlDatum.hasFetchStatus(datum)) { if (!fetchSet) { - fetch.set(datum); + if (multiple) { +fetch.set(datum); + } else { +fetch = datum; + } fetchSet = true; } else { // always take the latest version @@ -92,8 +103,13 @@ switch (datum.getStatus()) {// collect other info case CrawlDatum.STATUS_LINKED: -CrawlDatum link = new CrawlDatum(); -link.set(datum); +CrawlDatum link; +if (multiple) { + link = new CrawlDatum(); + link.set(datum); +} else { + link = datum; +} linked.add(link); break; case CrawlDatum.STATUS_SIGNATURE: @@ -115,10 +131,11 @@ // still no new data - record only unchanged old data, if exists, and return if (!fetchSet) { - if (oldSet) // at this point at least old should be present + if (oldSet) {// at this point at least old should be present output.collect(key, old); - else + } else { LOG.warn(Missing fetch and old value, signature= + signature); + } return; }
svn commit: r884269 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java src/java/org/apache/nutch/indexer/solr/SolrWriter.java src/java/org/apache/nutch
Author: ab Date: Wed Nov 25 20:58:10 2009 New Revision: 884269 URL: http://svn.apache.org/viewvc?rev=884269view=rev Log: NUTCH-760 Allow field mapping from nutch to solr index. Added: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=884269r1=884268r2=884269view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Nov 25 20:58:10 2009 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-760 Allow field mapping from Nutch to Solr index (David Stuart, ab) + * NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer (Julien Nioche, ab) * NUTCH-753 Prevent new Fetcher from retrieving the robots twice (Julien Nioche via ab) Added: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java?rev=884269view=auto == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java Wed Nov 25 20:58:10 2009 @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.solr; + +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.util.HashMap; +import java.util.Map; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.ObjectCache; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +public class SolrMappingReader { + public static Log LOG = LogFactory.getLog(SolrMappingReader.class); + + /** The property name of the parse solr index mapping location */ + private static final String SS_FILE_MAPPING = solrindex.mapping.file; + + private Configuration conf; + + private MapString, String keyMap = new HashMapString, String(); + private MapString, String copyMap = new HashMapString, String(); + private String uniqueKey = id; + + public static synchronized SolrMappingReader getInstance(Configuration conf) { +ObjectCache cache = ObjectCache.get(conf); +SolrMappingReader instance = (SolrMappingReader)cache.getObject(SolrMappingReader.class.getName()); +if (instance == null) { + instance = new SolrMappingReader(conf); + cache.setObject(SolrMappingReader.class.getName(), instance); +} +return instance; + } + + protected SolrMappingReader(Configuration conf) { +this.conf = conf; +parseMapping(); + } + + private void parseMapping() { +InputStream ssInputStream = null; +ssInputStream = conf.getConfResourceAsInputStream(conf.get(SS_FILE_MAPPING, solrindex-mapping.xml)); +InputSource inputSource = new InputSource(ssInputStream); +try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder builder = factory.newDocumentBuilder(); + Document document = builder.parse(inputSource); + Element rootElement = document.getDocumentElement(); + NodeList fieldList = rootElement.getElementsByTagName(field); + if (fieldList.getLength() 0) { +for (int i = 0; i fieldList.getLength(); i++) { + Element element = (Element) fieldList.item(i); + LOG.info(source: + element.getAttribute(source) + dest: + element.getAttribute(dest)); + keyMap.put(element.getAttribute
svn commit: r884277 - in /lucene/nutch/trunk: lib/ src/plugin/lib-lucene-analyzers/ src/plugin/lib-lucene-analyzers/lib/ src/plugin/summary-lucene/ src/plugin/summary-lucene/lib/ src/plugin/summary-lu
Author: ab Date: Wed Nov 25 21:17:10 2009 New Revision: 884277 URL: http://svn.apache.org/viewvc?rev=884277view=rev Log: NUTCH-772 Upgrade Nutch to use Lucene 2.9.1. Added: lucene/nutch/trunk/lib/lucene-core-2.9.1.jar (with props) lucene/nutch/trunk/lib/lucene-misc-2.9.1.jar (with props) lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.9.1.jar (with props) lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.9.1.jar (with props) Removed: lucene/nutch/trunk/lib/lucene-core-2.4.0.jar lucene/nutch/trunk/lib/lucene-misc-2.4.0.jar lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.4.0.jar lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.4.0.jar Modified: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java Added: lucene/nutch/trunk/lib/lucene-core-2.9.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-core-2.9.1.jar?rev=884277view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-core-2.9.1.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/lucene-misc-2.9.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-misc-2.9.1.jar?rev=884277view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-misc-2.9.1.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.9.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.9.1.jar?rev=884277view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.9.1.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml?rev=884277r1=884276r2=884277view=diff == --- lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml Wed Nov 25 21:17:10 2009 @@ -25,11 +25,11 @@ plugin id=lib-lucene-analyzers name=Lucene Analysers - version=2.4.0 + version=2.9.1 provider-name=org.apache.lucene runtime - library name=lucene-analyzers-2.4.0.jar + library name=lucene-analyzers-2.9.1.jar export name=*/ /library /runtime Added: lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.9.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.9.1.jar?rev=884277view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.9.1.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml?rev=884277r1=884276r2=884277view=diff == --- lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml Wed Nov 25 21:17:10 2009 @@ -25,7 +25,7 @@ library name=summary-lucene.jar export name=*/ /library - library name=lucene-highlighter-2.4.0.jar/ + library name=lucene-highlighter-2.9.1.jar/ /runtime requires Modified: lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java?rev=884277r1=884276r2=884277view=diff == --- lucene/nutch/trunk/src/plugin/summary-lucene/src/java/org/apache/nutch/summary/lucene/LuceneSummarizer.java (original) +++ lucene/nutch/trunk/src/plugin
svn commit: r884293 - /lucene/nutch/trunk/conf/solrindex-mapping.xml
Author: ab Date: Wed Nov 25 22:04:28 2009 New Revision: 884293 URL: http://svn.apache.org/viewvc?rev=884293view=rev Log: Add part of NUTCH-760. Added: lucene/nutch/trunk/conf/solrindex-mapping.xml (with props) Added: lucene/nutch/trunk/conf/solrindex-mapping.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/solrindex-mapping.xml?rev=884293view=auto == --- lucene/nutch/trunk/conf/solrindex-mapping.xml (added) +++ lucene/nutch/trunk/conf/solrindex-mapping.xml Wed Nov 25 22:04:28 2009 @@ -0,0 +1,46 @@ +?xml version=1.0 encoding=UTF-8? +!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the License); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an AS IS BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +-- + +mapping + !-- Simple mapping of fields created by Nutch IndexingFilters +to fields defined (and expected) in Solr schema.xml. + + Any fields in NutchDocument that match a name defined + in field/@source will be renamed to the corresponding + field/@dest. + Additionally, if a field name (before mapping) matches + a copyField/@source then its values will be copied to + the corresponding copyField/@dest. + + uniqueKey has the same meaning as in Solr schema.xml + and defaults to id if not defined. + -- + fields + field dest=content source=content/ + field dest=site source=site/ + field dest=title source=title/ + field dest=host source=host/ + field dest=segment source=segment/ + field dest=boost source=boost/ + field dest=digest source=digest/ + field dest=tstamp source=tstamp/ + field dest=id source=url/ + copyField source=url dest=url/ + /fields + uniqueKeyid_url/uniqueKey +/mapping Propchange: lucene/nutch/trunk/conf/solrindex-mapping.xml -- svn:eol-style = native
svn commit: r823532 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
Author: ab Date: Fri Oct 9 12:53:27 2009 New Revision: 823532 URL: http://svn.apache.org/viewvc?rev=823532view=rev Log: NUTCH-730 NPE in LinkRank if no nodes with which to create the WebGraph. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=823532r1=823531r2=823532view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Oct 9 12:53:27 2009 @@ -12,6 +12,9 @@ * NUTCH-707 - Generation of multiple segments in multiple runs returns only 1 segment (Michael Chen, ab) +* NUTCH-730 - NPE in LinkRank if no nodes with which to create the WebGraph + (Dennis Kubes via ab) + Release 1.0 - 2009-03-23 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=823532r1=823531r2=823532view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Fri Oct 9 12:53:27 2009 @@ -122,7 +122,13 @@ BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks)); String numLinksLine = buffer.readLine(); readLinks.close(); - + +// check if there are links to process, if none, webgraph might be empty +if (numLinksLine == null || numLinksLine.length() == 0) { + fs.delete(numLinksPath, true); + throw new IOException(No links to process, is the webgraph empty?); +} + // delete temp file and convert and return the number of links as an int LOG.info(Deleting numlinks temp file); fs.delete(numLinksPath, true);
svn commit: r823540 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
Author: ab Date: Fri Oct 9 13:11:15 2009 New Revision: 823540 URL: http://svn.apache.org/viewvc?rev=823540view=rev Log: NUTCH-731 Redirection of robots.txt in RobotRulesParser. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=823540r1=823539r2=823540view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Oct 9 13:11:15 2009 @@ -15,6 +15,8 @@ * NUTCH-730 - NPE in LinkRank if no nodes with which to create the WebGraph (Dennis Kubes via ab) +* NUTCH-731 - Redirection of robots.txt in RobotRulesParser (Julien Nioche via ab) + Release 1.0 - 2009-03-23 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab) Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=823540r1=823539r2=823540view=diff == --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Fri Oct 9 13:11:15 2009 @@ -434,10 +434,29 @@ boolean cacheRule = true; if (robotRules == null) { // cache miss + URL redir = null; if (LOG.isTraceEnabled()) { LOG.trace(cache miss + url); } try { Response response = http.getResponse(new URL(url, /robots.txt), new CrawlDatum(), true); +// try one level of redirection ? +if (response.getCode() == 301 || response.getCode() == 302) { + String redirection = response.getHeader(Location); + if (redirection == null) { +// some versions of MS IIS are known to mangle this header +redirection = response.getHeader(location); + } + if (redirection != null) { +if (!redirection.startsWith(http)) { + // RFC says it should be absolute, but apparently it isn't + redir = new URL(url, redirection); +} else { + redir = new URL(redirection); +} + +response = http.getResponse(redir, new CrawlDatum(), true); + } +} if (response.getCode() == 200) // found rules: parse them robotRules = parseRules(response.getContent()); @@ -456,8 +475,12 @@ robotRules = EMPTY_RULES; } - if (cacheRule){ + if (cacheRule) { CACHE.put(host, robotRules); // cache rules for host +if (redir != null !redir.getHost().equals(host)) { + // cache also for the redirected host + CACHE.put(redir.getHost(), robotRules); +} } } return robotRules;
svn commit: r823547 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/searcher/response/RequestUtils.java src/test/org/apache/nutch/searcher/response/ src/test/org/apache/nutch/searcher/
Author: ab Date: Fri Oct 9 13:29:01 2009 New Revision: 823547 URL: http://svn.apache.org/viewvc?rev=823547view=rev Log: NUTCH-757 RequestUtils getBooleanParameter() always returns false. Added: lucene/nutch/trunk/src/test/org/apache/nutch/searcher/response/ lucene/nutch/trunk/src/test/org/apache/nutch/searcher/response/TestRequestUtils.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/searcher/response/RequestUtils.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=823547r1=823546r2=823547view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Oct 9 13:29:01 2009 @@ -17,6 +17,9 @@ * NUTCH-731 - Redirection of robots.txt in RobotRulesParser (Julien Nioche via ab) +* NUTCH-757 - RequestUtils getBooleanParameter() always returns false + (Niall Pemberton via ab) + Release 1.0 - 2009-03-23 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/response/RequestUtils.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/response/RequestUtils.java?rev=823547r1=823546r2=823547view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/response/RequestUtils.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/response/RequestUtils.java Fri Oct 9 13:29:01 2009 @@ -66,9 +66,9 @@ if (parameterExists(request, param)) { String value = request.getParameter(param); if (StringUtils.isNotBlank(value) - (StringUtils.equals(param, 1) - || StringUtils.equalsIgnoreCase(param, true) || StringUtils.equalsIgnoreCase( - param, yes))) { + (StringUtils.equals(value, 1) + || StringUtils.equalsIgnoreCase(value, true) || StringUtils.equalsIgnoreCase( + value, yes))) { return true; } } @@ -79,9 +79,9 @@ String param, Boolean def) { if (parameterExists(request, param)) { String value = request.getParameter(param); - return (StringUtils.isNotBlank(value) (StringUtils.equals(param, 1) -|| StringUtils.equalsIgnoreCase(param, true) || StringUtils.equalsIgnoreCase( -param, yes))); + return (StringUtils.isNotBlank(value) (StringUtils.equals(value, 1) +|| StringUtils.equalsIgnoreCase(value, true) || StringUtils.equalsIgnoreCase( +value, yes))); } return def; } Added: lucene/nutch/trunk/src/test/org/apache/nutch/searcher/response/TestRequestUtils.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/searcher/response/TestRequestUtils.java?rev=823547view=auto == --- lucene/nutch/trunk/src/test/org/apache/nutch/searcher/response/TestRequestUtils.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/searcher/response/TestRequestUtils.java Fri Oct 9 13:29:01 2009 @@ -0,0 +1,140 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.searcher.response; + +import java.lang.reflect.InvocationHandler; +import java.lang.reflect.Method; +import java.lang.reflect.Proxy; +import java.util.HashMap; +import java.util.Map; + +import javax.servlet.http.HttpServletRequest; + +import junit.framework.TestCase; + +public class TestRequestUtils extends TestCase { + + public TestRequestUtils(String name) { +super(name); + } + + /** + * Test getBooleanParameter() - no default + */ + public void testGetBooleanParameterNoDefault() { +String param = foo; +Map parameters = new HashMap(); +HttpServletRequest request = createMockHttpServletRequest(parameters); + +assertFalse(No param, RequestUtils.getBooleanParameter(request, param)); + +parameters.put(param, 0); +assertFalse(Foo=0,RequestUtils.getBooleanParameter(request, param)); + +parameters.put(param
svn commit: r823553 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/parse/ParseData.java src/java/org/apache/nutch/parse/ParseText.java src/java/org/apache/nutch/protocol/Content.java
Author: ab Date: Fri Oct 9 13:54:27 2009 New Revision: 823553 URL: http://svn.apache.org/viewvc?rev=823553view=rev Log: NUTCH-754 Use GenericOptionsParser instead of FileSystem.parseArgs(). Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=823553r1=823552r2=823553view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Oct 9 13:54:27 2009 @@ -20,6 +20,9 @@ * NUTCH-757 - RequestUtils getBooleanParameter() always returns false (Niall Pemberton via ab) +* NUTCH-754 - Use GenericOptionsParser instead of FileSystem.parseArgs() (Julien + Nioche via ab) + Release 1.0 - 2009-03-23 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=823553r1=823552r2=823553view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Fri Oct 9 13:54:27 2009 @@ -20,9 +20,12 @@ import java.io.*; import java.util.*; +import org.apache.commons.cli.Options; import org.apache.hadoop.io.*; +import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; +import org.apache.hadoop.fs.FileSystem; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.util.NutchConfiguration; @@ -205,11 +208,18 @@ return; } +Options opts = new Options(); Configuration conf = NutchConfiguration.create(); -FileSystem fs = FileSystem.parseArgs(argv, 0, conf); + +GenericOptionsParser parser = + new GenericOptionsParser(conf, opts, argv); + +String[] remainingArgs = parser.getRemainingArgs(); +FileSystem fs = FileSystem.get(conf); + try { - int recno = Integer.parseInt(argv[0]); - String segment = argv[1]; + int recno = Integer.parseInt(remainingArgs[0]); + String segment = remainingArgs[1]; Path file = new Path(segment, DIR_NAME); System.out.println(Reading from file: + file); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java?rev=823553r1=823552r2=823553view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java Fri Oct 9 13:54:27 2009 @@ -19,8 +19,10 @@ import java.io.*; import org.apache.hadoop.io.*; +import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.*; +import org.apache.commons.cli.Options; import org.apache.nutch.util.NutchConfiguration; /* The text conversion of page's content, stored using gzip compression. @@ -86,12 +88,18 @@ System.out.println(usage: + usage); return; } - +Options opts = new Options(); Configuration conf = NutchConfiguration.create(); -FileSystem fs = FileSystem.parseArgs(argv, 0, conf); + +GenericOptionsParser parser = + new GenericOptionsParser(conf, opts, argv); + +String[] remainingArgs = parser.getRemainingArgs(); + +FileSystem fs = FileSystem.get(conf); try { - int recno = Integer.parseInt(argv[0]); - String segment = argv[1]; + int recno = Integer.parseInt(remainingArgs[0]); + String segment = remainingArgs[1]; String filename = new Path(segment, ParseText.DIR_NAME).toString(); ParseText parseText = new ParseText(); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=823553r1=823552r2=823553view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Fri Oct 9 13:54:27 2009 @@ -27,6 +27,7 @@ import java.util.zip.InflaterInputStream; //Hadoop imports +import org.apache.commons.cli.Options; import org.apache.hadoop.conf.Configuration; import
svn commit: r823557 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDatum.java
Author: ab Date: Fri Oct 9 14:05:05 2009 New Revision: 823557 URL: http://svn.apache.org/viewvc?rev=823557view=rev Log: NUTCH-756 CrawlDatum.set() does not reset Metadata if it is null. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=823557r1=823556r2=823557view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Oct 9 14:05:05 2009 @@ -23,6 +23,9 @@ * NUTCH-754 - Use GenericOptionsParser instead of FileSystem.parseArgs() (Julien Nioche via ab) +* NUTCH-756 - CrawlDatum.set() does not reset Metadata if it is null (Julien Nioche + via ab) + Release 1.0 - 2009-03-23 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=823557r1=823556r2=823557view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Fri Oct 9 14:05:05 2009 @@ -324,6 +324,8 @@ this.signature = that.signature; if (that.metaData != null) { this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy +} else { + this.metaData = null; } }
svn commit: r823600 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java
Author: ab Date: Fri Oct 9 15:56:02 2009 New Revision: 823600 URL: http://svn.apache.org/viewvc?rev=823600view=rev Log: NUTCH-679 Fetcher2 implementing Tool. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=823600r1=823599r2=823600view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Oct 9 15:56:02 2009 @@ -26,6 +26,8 @@ * NUTCH-756 - CrawlDatum.set() does not reset Metadata if it is null (Julien Nioche via ab) +* NUTCH-679 - Fetcher2 implementing Tool (Julien Nioche via ab) + Release 1.0 - 2009-03-23 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=823600r1=823599r2=823600view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Oct 9 15:56:02 2009 @@ -35,6 +35,8 @@ import org.apache.hadoop.conf.*; import org.apache.hadoop.mapred.*; import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; @@ -83,7 +85,7 @@ * * @author Andrzej Bialecki */ -public class Fetcher extends Configured implements +public class Fetcher extends Configured implements Tool, MapRunnableText, CrawlDatum, Text, NutchWritable { public static final int PERM_REFRESH_TIME = 5; @@ -972,19 +974,22 @@ /** Run the fetcher. */ public static void main(String[] args) throws Exception { +int res = ToolRunner.run(NutchConfiguration.create(), new Fetcher(), args); +System.exit(res); + } + + public int run(String[] args) throws Exception { String usage = Usage: Fetcher segment [-threads n] [-noParsing]; if (args.length 1) { System.err.println(usage); - System.exit(-1); + return -1; } Path segment = new Path(args[0]); -Configuration conf = NutchConfiguration.create(); - -int threads = conf.getInt(fetcher.threads.fetch, 10); +int threads = getConf().getInt(fetcher.threads.fetch, 10); boolean parsing = true; for (int i = 1; i args.length; i++) { // parse command line @@ -993,13 +998,17 @@ } else if (args[i].equals(-noParsing)) parsing = false; } -conf.setInt(fetcher.threads.fetch, threads); +getConf().setInt(fetcher.threads.fetch, threads); if (!parsing) { - conf.setBoolean(fetcher.parse, parsing); + getConf().setBoolean(fetcher.parse, parsing); +} +try { + fetch(segment, threads, parsing); + return 0; +} catch (Exception e) { + LOG.fatal(Fetcher: + StringUtils.stringifyException(e)); + return -1; } -Fetcher fetcher = new Fetcher(conf); // make a Fetcher - -fetcher.fetch(segment, threads, parsing); // run the Fetcher }
svn commit: r750037 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
Author: ab Date: Wed Mar 4 15:02:29 2009 New Revision: 750037 URL: http://svn.apache.org/viewvc?rev=750037view=rev Log: NUTCH-711 - Indexer failing after upgrade to Hadoop 0.19.1. This is a temporary fix, to be revisited later. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=750037r1=750036r2=750037view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Mar 4 15:02:29 2009 @@ -374,6 +374,8 @@ 140. NUTCH-669 - Consolidate code for Fetcher and Fetcher2 (siren) +141. NUTCH-711 - Indexer failing after upgrade to Hadoop 0.19.1 (ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java?rev=750037r1=750036r2=750037view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java Wed Mar 4 15:02:29 2009 @@ -31,6 +31,10 @@ @Override public RecordWriterText, NutchDocument getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress) throws IOException { + +// populate JobConf with field indexing options +IndexingFilters filters = new IndexingFilters(job); + final NutchIndexWriter[] writers = NutchIndexWriterFactory.getNutchIndexWriters(job);
svn commit: r749247 - /lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
Author: ab Date: Mon Mar 2 09:11:03 2009 New Revision: 749247 URL: http://svn.apache.org/viewvc?rev=749247view=rev Log: NUTCH-419 Unavailable robots.txt kills fetch. Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=749247r1=749246r2=749247view=diff == --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Mon Mar 2 09:11:03 2009 @@ -176,6 +176,10 @@ params.setDefaultMaxConnectionsPerHost(maxThreadsTotal); } +// executeMethod(HttpMethod) seems to ignore the connection timeout on the connection manager. +// set it explicitly on the HttpClient. +client.getParams().setConnectionManagerTimeout(timeout); + HostConfiguration hostConf = client.getHostConfiguration(); ArrayList headers = new ArrayList(); // Set the User Agent in the header
svn commit: r741559 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
Author: ab Date: Fri Feb 6 13:17:08 2009 New Revision: 741559 URL: http://svn.apache.org/viewvc?rev=741559view=rev Log: NUTCH-636 Httpclient plugin https doesn't work on IBM JRE. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=741559r1=741558r2=741559view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Feb 6 13:17:08 2009 @@ -335,6 +335,9 @@ 125. NUTCH-643 - ClassCastException in PDF parser (Guillaume Smet, ab) +126. NUTCH-636 - Httpclient plugin https doesn't work on IBM JRE + (Curtis d'Entremont, ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java?rev=741559r1=741558r2=741559view=diff == --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java Fri Feb 6 13:17:08 2009 @@ -44,11 +44,12 @@ */ public DummyX509TrustManager(KeyStore keystore) throws NoSuchAlgorithmException, KeyStoreException { super(); -TrustManagerFactory factory = TrustManagerFactory.getInstance(SunX509); +String algo = TrustManagerFactory.getDefaultAlgorithm(); +TrustManagerFactory factory = TrustManagerFactory.getInstance(algo); factory.init(keystore); TrustManager[] trustmanagers = factory.getTrustManagers(); if (trustmanagers.length == 0) { -throw new NoSuchAlgorithmException(SunX509 trust manager not supported); +throw new NoSuchAlgorithmException(algo + trust manager not supported); } this.standardTrustManager = (X509TrustManager)trustmanagers[0]; }
svn commit: r741558 - in /lucene/nutch/trunk: ./ src/plugin/parse-pdf/ src/plugin/parse-pdf/lib/ src/plugin/parse-pdf/sample/ src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/ src/plugin/parse
Author: ab Date: Fri Feb 6 13:09:07 2009 New Revision: 741558 URL: http://svn.apache.org/viewvc?rev=741558view=rev Log: NUTCH-643 ClassCastException in PDF parser, upgrade to unofficial PDFBox 0.7.4 Added: lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-0.2.0-dev.jar (with props) lucene/nutch/trunk/src/plugin/parse-pdf/lib/JempBox-0.2.0.jar (with props) lucene/nutch/trunk/src/plugin/parse-pdf/lib/JempBox-LICENSE.txt (with props) lucene/nutch/trunk/src/plugin/parse-pdf/lib/NOTICE.txt (with props) lucene/nutch/trunk/src/plugin/parse-pdf/lib/PDFBox-0.7.4-dev.jar (with props) lucene/nutch/trunk/src/plugin/parse-pdf/lib/bcprov-LICENSE.txt (with props) lucene/nutch/trunk/src/plugin/parse-pdf/lib/bcprov-jdk14-132.jar (with props) lucene/nutch/trunk/src/plugin/parse-pdf/lib/jai_codec.jar (with props) lucene/nutch/trunk/src/plugin/parse-pdf/lib/jai_core.jar (with props) lucene/nutch/trunk/src/plugin/parse-pdf/sample/encrypted.pdf (with props) Removed: lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-0.1.0-dev.jar lucene/nutch/trunk/src/plugin/parse-pdf/lib/PDFBox-0.7.3.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/parse-pdf/build.xml lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml lucene/nutch/trunk/src/plugin/parse-pdf/sample/README.txt lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=741558r1=741557r2=741558view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Feb 6 13:09:07 2009 @@ -333,6 +333,8 @@ 124. NUTCH-671 - JSP errors in Nutch searcher webapp (Edwin Chu via ab) +125. NUTCH-643 - ClassCastException in PDF parser (Guillaume Smet, ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/parse-pdf/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/build.xml?rev=741558r1=741557r2=741558view=diff == --- lucene/nutch/trunk/src/plugin/parse-pdf/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/build.xml Fri Feb 6 13:09:07 2009 @@ -28,6 +28,10 @@ !-- for junit test -- mkdir dir=${build.test}/data/ - copy file=sample/pdftest.pdf todir=${build.test}/data/ + copy todir=${build.test}/data +fileset dir=sample + include name=*.pdf/ +/fileset + /copy /project Added: lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-0.2.0-dev.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-0.2.0-dev.jar?rev=741558view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-0.2.0-dev.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-pdf/lib/JempBox-0.2.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/lib/JempBox-0.2.0.jar?rev=741558view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-pdf/lib/JempBox-0.2.0.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-pdf/lib/JempBox-LICENSE.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/lib/JempBox-LICENSE.txt?rev=741558view=auto == --- lucene/nutch/trunk/src/plugin/parse-pdf/lib/JempBox-LICENSE.txt (added) +++ lucene/nutch/trunk/src/plugin/parse-pdf/lib/JempBox-LICENSE.txt Fri Feb 6 13:09:07 2009 @@ -0,0 +1,25 @@ +Copyright (c) 2006-2007, www.jempbox.org +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of pdfbox; nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission
svn commit: r740318 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/net/URLNormalizerChecker.java
Author: ab Date: Tue Feb 3 15:12:48 2009 New Revision: 740318 URL: http://svn.apache.org/viewvc?rev=740318view=rev Log: NUTCH-279 Additions to urlnormalizer-regex (modified). Added: lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=740318r1=740317r2=740318view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Feb 3 15:12:48 2009 @@ -328,6 +328,9 @@ 122. NUTCH-682 - SOLR indexer does not set boost on the document. (julien nioche via dogacan) + +123. NUTCH-279 - Additions to urlnormalizer-regex (Stefan Neufeind, ab) + Release 0.9 - 2007-04-02 Added: lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java?rev=740318view=auto == --- lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java Tue Feb 3 15:12:48 2009 @@ -0,0 +1,115 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net; + +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.ExtensionPoint; +import org.apache.nutch.plugin.PluginRepository; + +import org.apache.hadoop.conf.Configuration; + +import org.apache.nutch.util.NutchConfiguration; + +import java.io.BufferedReader; +import java.io.InputStreamReader; + +/** + * Checks one given normalizer or all normalizers. + */ +public class URLNormalizerChecker { + + private Configuration conf; + + public URLNormalizerChecker(Configuration conf) { + this.conf = conf; + } + + private void checkOne(String normalizerName, String scope) throws Exception { +URLNormalizer normalizer = null; + +ExtensionPoint point = + PluginRepository.get(conf).getExtensionPoint(URLNormalizer.X_POINT_ID); + +if (point == null) + throw new RuntimeException(URLNormalizer.X_POINT_ID+ not found.); + +Extension[] extensions = point.getExtensions(); + +for (int i = 0; i extensions.length; i++) { + Extension extension = extensions[i]; + normalizer = (URLNormalizer)extension.getExtensionInstance(); + if (normalizer.getClass().getName().equals(normalizerName)) { +break; + } else { +normalizer = null; + } +} + +if (normalizer == null) + throw new RuntimeException(URLNormalizer +normalizerName+ not found.); + +System.out.println(Checking URLNormalizer + normalizerName); + +BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); +String line; +while ((line = in.readLine()) != null) { + String out = normalizer.normalize(line, scope); + System.out.println(out); +} + } + + private void checkAll(String scope) throws Exception { +System.out.println(Checking combination of all URLNormalizers available); + +BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); +String line; +URLNormalizers normalizers = new URLNormalizers(conf, scope); +while((line = in.readLine()) != null) { + String out = normalizers.normalize(line, scope); + System.out.println(out); +} + } + + public static void main(String[] args) throws Exception { + +String usage = Usage: URLNormalizerChecker [-normalizer normalizerName] [-scope scope] + + \n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink; + +String normalizerName = null; +String scope = URLNormalizers.SCOPE_DEFAULT; +for (int i = 0; i args.length; i++) { + if (args[i].equals(-normalizer)) { +normalizerName = args[++i]; + } else if (args[i].equals(-scope)) { +scope = args[++i]; + } else { +System.err.println(usage); +System.exit(-1
svn commit: r697878 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/util/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-http/src/java/org/apache/nutch/pro
Author: ab Date: Mon Sep 22 09:02:40 2008 New Revision: 697878 URL: http://svn.apache.org/viewvc?rev=697878view=rev Log: NUTCH-375 - Add support for Content-Encoding: deflate. Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=697878r1=697877r2=697878view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Sep 22 09:02:40 2008 @@ -268,6 +268,9 @@ 98. NUTCH-651 - Remove bin/{start|stop}-balancer.sh from svn tracking. (dogacan) +99. NUTCH-375 - Add support for Content-Encoding: deflated +(Pascal Beis, ab) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java?rev=697878view=auto == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java Mon Sep 22 09:02:40 2008 @@ -0,0 +1,142 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the License); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util; + +import java.io.ByteArrayOutputStream; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.zip.Inflater; +import java.util.zip.InflaterInputStream; +import java.util.zip.DeflaterOutputStream; + +// Commons Logging imports +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * A collection of utility methods for working on deflated data. + */ +public class DeflateUtils { + + private static final Log LOG = LogFactory.getLog(DeflateUtils.class); + private static final int EXPECTED_COMPRESSION_RATIO = 5; + private static final int BUF_SIZE = 4096; + + /** + * Returns an inflated copy of the input array. If the deflated + * input has been truncated or corrupted, a best-effort attempt is + * made to inflate as much as possible. If no data can be extracted + * codenull/code is returned. + */ + public static final byte[] inflateBestEffort(byte[] in) { +return inflateBestEffort(in, Integer.MAX_VALUE); + } + + /** + * Returns an inflated copy of the input array, truncated to + * codesizeLimit/code bytes, if necessary. If the deflated input + * has been truncated or corrupted, a best-effort attempt is made to + * inflate as much as possible. If no data can be extracted + * codenull/code is returned. + */ + public static final byte[] inflateBestEffort(byte[] in, int sizeLimit) { +// decompress using InflaterInputStream +ByteArrayOutputStream outStream = + new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length); + +// true because HTTP does not provide zlib headers +Inflater inflater = new Inflater(true); +InflaterInputStream inStream = + new InflaterInputStream(new ByteArrayInputStream(in), inflater); + +byte[] buf = new byte[BUF_SIZE]; +int written = 0; +while (true) { + try { + int size = inStream.read(buf); + if (size = 0) + break; + if ((written + size) sizeLimit) { + outStream.write(buf, 0, sizeLimit - written); + break; + } + outStream.write(buf, 0, size); + written+= size; + } catch (Exception e) { + LOG.info( Caught Exception in inflateBestEffort ); +e.printStackTrace(LogUtil.getWarnStream(LOG)); + break; + } +} +try { + outStream.close(); +} catch (IOException e) { +} + +return outStream.toByteArray(); + } + + + /** + * Returns an inflated copy
svn commit: r686900 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexSorter.java src/test/org/apache/nutch/indexer/TestIndexSorter.java
Author: ab Date: Mon Aug 18 16:56:20 2008 New Revision: 686900 URL: http://svn.apache.org/viewvc?rev=686900view=rev Log: NUTCH-641 IndexSorter incorrectly copies stored fields. Added: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=686900r1=686899r2=686900view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Aug 18 16:56:20 2008 @@ -256,6 +256,9 @@ 93. NUTCH-634 - Upgrade Nutch to Hadoop 0.17.1 (Michael Gottesman, Lincoln Ritter, ab) +94. NUTCH-641 - IndexSorter inorrectly copies stored fields (ab) + + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=686900r1=686899r2=686900view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Mon Aug 18 16:56:20 2008 @@ -191,6 +191,11 @@ return super.document(newToOld[n]); } +public Document document(int n, FieldSelector fieldSelector) +throws CorruptIndexException, IOException { + return super.document(newToOld[n], fieldSelector); +} + public boolean isDeleted(int n) { return false; } @@ -240,6 +245,10 @@ return this.score that.score ? 1 : -1 ; } } + +public String toString() { + return oldDoc= + oldDoc + ,score= + score; +} } public IndexSorter() { Added: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java?rev=686900view=auto == --- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java Mon Aug 18 16:56:20 2008 @@ -0,0 +1,145 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.indexer; + +import java.io.File; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileUtil; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.nutch.analysis.NutchDocumentAnalyzer; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestIndexSorter extends TestCase { + private static final Log LOG = LogFactory.getLog(TestIndexSorter.class); + + private static final String INDEX_PLAIN = index; + private static final String INDEX_SORTED = index-sorted; + private static final int NUM_DOCS = 254; + private String[] fieldNames = new String[] { + id, + url, + site, + content, + host, + anchor, + boost + }; + + Configuration conf = null; + File testDir = null; + Directory dir = null; + + + protected void setUp() throws Exception { +if (conf == null) conf = NutchConfiguration.create(); +// create test index +testDir = new File(indexSorter-test- + System.currentTimeMillis()); +if (!testDir.mkdirs()) { + throw new Exception(Can't create test dir + testDir.toString()); +} +LOG.info(Creating
svn commit: r686910 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/parse-swf/sample/test1.txt src/plugin/parse-swf/sample/test2.txt src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser
Author: ab Date: Mon Aug 18 17:42:07 2008 New Revision: 686910 URL: http://svn.apache.org/viewvc?rev=686910view=rev Log: NUTCH-645 Parse-swf unit test failing - fix. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.txt lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.txt lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=686910r1=686909r2=686910view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Aug 18 17:42:07 2008 @@ -258,6 +258,8 @@ 94. NUTCH-641 - IndexSorter inorrectly copies stored fields (ab) +95. NUTCH-645 - Parse-swf unit test failing (ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.txt?rev=686910r1=686909r2=686910view=diff == --- lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.txt (original) +++ lucene/nutch/trunk/src/plugin/parse-swf/sample/test1.txt Mon Aug 18 17:42:07 2008 @@ -1,60 +1,60 @@ -Help -javascript:openCrosslinkWindow('/go/adobeacquisition') -Macromedia Home +/go/gnav_cart +/go/gnav_company +/go/gnav_devnet +/go/gnav_downloads +/go/gnav_fl_minmessage +/go/gnav_help +/go/gnav_mm_home +/go/gnav_products /go/gnav_search?loc=en_us -MovieClip -solutions /go/gnav_showcase -_sans -rollOut -To ensure the best possible Internet Experience, please download the latest version of the free +/go/gnav_solutions /go/gnav_store +/go/gnav_support +/go/gnav_your_account +Acquisition Info +Adobe Home +AppleGothic +Array +Company +Developers +Downloads +Help +Home International +LocaleManager +Macromedia Flash Player +Macromedia Home +MovieClip Products +Showcase +Solutions +Store +String +Support +TextFormat +To ensure the best possible Internet Experience, please download the latest version of the free +Verdana +_sans +active +bluePill +button +color +company devnet +downloads en_us -/go/gnav_products -AppleGothic -Macromedia Flash Player -active +home +javascript:openCrosslinkWindow('/go/adobeacquisition') +javascript:openCrosslinkWindow('/go/gnav_adobe_home') products -String -Store -downloads +rollOut rollOver -Adobe Home -/go/gnav_your_account -/go/gnav_downloads -Showcase -bluePill -/go/gnav_company -/go/gnav_support -/go/gnav_help -javascript:openCrosslinkWindow('/go/gnav_adobe_home') -home -Home -Array -/go/gnav_fl_minmessage -textColor -Developers -Support -color -support +selected showcase -button -/go/gnav_mm_home +solutions +support tabHolder -selected -Solutions -LocaleManager -Verdana -/go/gnav_devnet -Acquisition Info -/go/gnav_cart -Company -/go/gnav_solutions -company -Downloads -TextFormat +textColor Modified: lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.txt?rev=686910r1=686909r2=686910view=diff == --- lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.txt (original) +++ lucene/nutch/trunk/src/plugin/parse-swf/sample/test2.txt Mon Aug 18 17:42:07 2008 @@ -1,5 +1,5 @@ Impact Impact Impact Arial Arial Arial Webdings Webdings Webdings Verdana Verdana Verdana CourierNew CourierNew CourierNew Bimini Bimini Bimini -font -color TextFormat +color +font Modified: lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java?rev=686910r1=686909r2=686910view=diff == --- lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java Mon Aug 18 17:42:07 2008 @@ -178,10 +178,11 @@ public String getActionText() { StringBuffer res = new StringBuffer(); -Iterator it = actionStrings.iterator(); -while (it.hasNext()) { - if (res.length() 0) res.append('\n'); - res.append(it.next()); +String[] strings = (String[])actionStrings.toArray(new String[actionStrings.size()]); +Arrays.sort(strings); +for (int i = 0; i strings.length; i++) { + if (i 0) res.append('\n'); + res.append(strings[i]); } return res.toString(); }
svn commit: r686912 - in /lucene/nutch/trunk: CHANGES.txt build.xml
Author: ab Date: Mon Aug 18 17:49:45 2008 New Revision: 686912 URL: http://svn.apache.org/viewvc?rev=686912view=rev Log: NUTCH-642 - Unit tests fail when run in non-local mode. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=686912r1=686911r2=686912view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Aug 18 17:49:45 2008 @@ -260,6 +260,8 @@ 95. NUTCH-645 - Parse-swf unit test failing (ab) +96. NUTCH-642 - Unit tests fail when run in non-local mode (ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=686912r1=686911r2=686912view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Mon Aug 18 17:49:45 2008 @@ -40,6 +40,7 @@ pathelement location=${test.src.dir}/ pathelement location=${plugins.classpath.dir}/ path refid=classpath/ +pathelement location=${build.dir}/${final.name}.job / /path !-- xmlcatalog definition for xslt task -- @@ -264,7 +265,7 @@ !-- == -- target name=test depends=test-core, test-plugins/ - target name=test-core depends=compile, compile-core-test + target name=test-core depends=job, compile-core-test delete dir=${test.build.data}/ mkdir dir=${test.build.data}/
svn commit: r678533 [2/2] - in /lucene/nutch/trunk: ./ conf/ lib/ lib/native/Linux-amd64-64/ lib/native/Linux-i386-32/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?rev=678533r1=678532r2=678533view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Mon Jul 21 12:20:21 2008 @@ -28,6 +28,8 @@ import org.apache.hadoop.conf.Configured; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; @@ -158,7 +160,7 @@ JobConf job = new NutchJob(getConf()); job.setBoolean(FILTER_KEY, filter); job.setBoolean(NORMALIZE_KEY, normalize); -job.addInputPath(new Path(args[0])); +FileInputFormat.addInputPath(job, new Path(args[0])); job.setInputFormat(TextInputFormat.class); job.setMapperClass(FG.class); job.setMapOutputKeyClass(Text.class); @@ -171,7 +173,8 @@ job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setOutputKeyComparatorClass(Generator.HashComparator.class); -job.setOutputPath(new Path(args[1], new Path(segName, CrawlDatum.GENERATE_DIR_NAME))); +FileOutputFormat.setOutputPath(job, new Path(args[1], +new Path(segName, CrawlDatum.GENERATE_DIR_NAME))); try { JobClient.runJob(job); return 0; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java?rev=678533r1=678532r2=678533view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java Mon Jul 21 12:20:21 2008 @@ -18,6 +18,8 @@ import java.io.IOException; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputSplit; @@ -29,7 +31,7 @@ * A input format the reads arc files. */ public class ArcInputFormat - extends FileInputFormat { + extends FileInputFormatText, BytesWritable { /** * Returns the codeRecordReader/code for reading the arc file. @@ -38,8 +40,8 @@ * @param job The job configuration. * @param reporter The progress reporter. */ - public RecordReader getRecordReader(InputSplit split, JobConf job, -Reporter reporter) + public RecordReaderText, BytesWritable getRecordReader(InputSplit split, + JobConf job, Reporter reporter) throws IOException { reporter.setStatus(split.toString()); return new ArcRecordReader(job, (FileSplit)split); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java?rev=678533r1=678532r2=678533view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java Mon Jul 21 12:20:21 2008 @@ -28,8 +28,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.util.ReflectionUtils; @@ -50,7 +48,7 @@ * @see http://www.grub.org/ */ public class ArcRecordReader - implements RecordReader { + implements RecordReaderText, BytesWritable { public static final Log LOG = LogFactory.getLog(ArcRecordReader.class); @@ -123,15 +121,15 @@ /** * Creates a new instance of the codeText/code object for the key. */ - public WritableComparable createKey() { -return (WritableComparable)ReflectionUtils.newInstance(Text.class, conf); + public Text createKey() { +return (Text)ReflectionUtils.newInstance(Text.class, conf); } /** * Creates a new instance of the codeBytesWritable/code object for the key */ - public Writable createValue() { -return (Writable)ReflectionUtils.newInstance(BytesWritable.class, conf); + public BytesWritable createValue() { +return (BytesWritable)ReflectionUtils.newInstance(BytesWritable.class, conf); } /** @@ -175,7 +173,7 @@ * * @throws
svn commit: r669300 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
Author: ab Date: Wed Jun 18 14:34:17 2008 New Revision: 669300 URL: http://svn.apache.org/viewvc?rev=669300view=rev Log: Avoid NPE when pocessing empty / corrupted indexes. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=669300r1=669299r2=669300view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Wed Jun 18 14:34:17 2008 @@ -220,7 +220,7 @@ } public void close() throws IOException { -indexReader.close(); +if (indexReader != null) indexReader.close(); } public Text createKey() {
svn commit: r638779 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/ap
Author: ab Date: Wed Mar 19 03:34:14 2008 New Revision: 638779 URL: http://svn.apache.org/viewvc?rev=638779view=rev Log: NUTCH-598 - Remove deprecated use of ToolBase. Use generics in Hadoop API. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=638779r1=638778r2=638779view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Mar 19 03:34:14 2008 @@ -239,6 +239,9 @@ 87. NUTCH-223 - Crawl.java uses Integer.MAX_VALUE (Jeff Ritchie via ab) +88. NUTCH-598 - Remove deprecated use of ToolBase. Use generics in Hadoop API. +(Emmanuel Joke, dogacan, ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=638779r1=638778r2=638779view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Wed Mar 19 03:34:14 2008 @@ -28,8 +28,7 @@ import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.*; import org.apache.hadoop.mapred.*; -import org.apache.hadoop.util.StringUtils; -import org.apache.hadoop.util.ToolBase; +import org.apache.hadoop.util.*; import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.LockUtil; @@ -40,7 +39,7 @@ * This class takes the output of the fetcher and updates the * crawldb accordingly. */ -public class CrawlDb extends ToolBase { +public class CrawlDb extends Configured implements Tool { public static final Log LOG = LogFactory.getLog(CrawlDb.class); public static final String CRAWLDB_ADDITIONS_ALLOWED = db.update.additions.allowed; @@ -48,11 +47,8 @@ public static final String CURRENT_NAME = current; public static final String LOCK_NAME = .locked; - - public CrawlDb() { - - } + public CrawlDb() {} public CrawlDb(Configuration conf) { setConf(conf); @@ -150,7 +146,7 @@ } public static void main(String[] args) throws Exception { -int res = new CrawlDb().doMain(NutchConfiguration.create(), args); +int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDb(), args); System.exit(res); } @@ -182,8 +178,8 @@ } else if (args[i].equals(-noAdditions)) { additionsAllowed = false; } else if (args[i].equals(-dir)) { -Path[] paths = fs.listPaths(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs)); -dirs.addAll(Arrays.asList(paths)); +FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs)); +dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths))); } else { dirs.add(new Path(args[i])); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=638779r1=638778r2=638779view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Wed Mar 19 03:34:14 2008 @@ -28,10 +28,9 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.*; -import
svn commit: r638782 - in /lucene/nutch/trunk: ./ src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/ src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnorma
Author: ab Date: Wed Mar 19 03:45:55 2008 New Revision: 638782 URL: http://svn.apache.org/viewvc?rev=638782view=rev Log: NUTCH-620 BasicURLNormalizer should collapse runs of slashes with a single slash. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=638782r1=638781r2=638782view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Mar 19 03:45:55 2008 @@ -242,6 +242,9 @@ 88. NUTCH-598 - Remove deprecated use of ToolBase. Use generics in Hadoop API. (Emmanuel Joke, dogacan, ab) +89. NUTCH-620 - BasicURLNormalizer should collapse runs of slashes with a +single slash. (Mark DeSpain via ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?rev=638782r1=638781r2=638782view=diff == --- lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original) +++ lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Wed Mar 19 03:45:55 2008 @@ -43,6 +43,7 @@ }; private Rule relativePathRule = null; private Rule leadingRelativePathRule = null; +private Rule adjacentSlashRule = null; private Configuration conf; @@ -64,6 +65,13 @@ compiler.compile(^(/\\.\\./)+, Perl5Compiler.READ_ONLY_MASK); leadingRelativePathRule.substitution = new Perl5Substitution(/); +// this pattern tries to find spots like xx//yy in the url, +// which could be replaced by a / +adjacentSlashRule = new Rule(); +adjacentSlashRule.pattern = (Perl5Pattern) + compiler.compile(/{2,}, Perl5Compiler.READ_ONLY_MASK); +adjacentSlashRule.substitution = new Perl5Substitution(/); + } catch (MalformedPatternException e) { e.printStackTrace(LogUtil.getWarnStream(LOG)); throw new RuntimeException(e); @@ -163,6 +171,13 @@ fileWorkCopy = Util.substitute (matcher, leadingRelativePathRule.pattern, leadingRelativePathRule.substitution, fileWorkCopy, 1); + + +// collapse adjacent slashes with / +fileWorkCopy = Util.substitute +(matcher, adjacentSlashRule.pattern, + adjacentSlashRule.substitution, fileWorkCopy, 1); + newLen = fileWorkCopy.length(); } Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=638782r1=638781r2=638782view=diff == --- lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (original) +++ lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Wed Mar 19 03:45:55 2008 @@ -90,12 +90,22 @@ http://foo.com/foo.html; ); normalizeTest(http://foo.com/foo.foo/../foo.html;, http://foo.com/foo.html; ); +normalizeTest(http://foo.com//aa/bb/foo.html;, + http://foo.com/aa/bb/foo.html; ); +normalizeTest(http://foo.com/aa//bb/foo.html;, + http://foo.com/aa/bb/foo.html; ); +normalizeTest(http://foo.com/aa/bb//foo.html;, + http://foo.com/aa/bb/foo.html; ); +normalizeTest(http://foo.com//aa//bb//foo.html;, + http://foo.com/aa/bb/foo.html; ); +normalizeTest(http://foo.comaabbfoo.html;, + http://foo.com/aa/bb/foo.html; ); } private void normalizeTest(String weird, String normal) throws Exception { assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT)); } - + public static void main(String[] args) throws Exception { new TestBasicURLNormalizer(test).testNormalizer(); }
svn commit: r637837 - /lucene/nutch/trunk/build.xml
Author: ab Date: Mon Mar 17 04:05:11 2008 New Revision: 637837 URL: http://svn.apache.org/viewvc?rev=637837view=rev Log: Don't add Hadoop config files to Nutch job file. Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=637837r1=637836r2=637837view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Mon Mar 17 04:05:11 2008 @@ -149,7 +149,7 @@ target name=job depends=compile jar jarfile=${build.dir}/${final.name}.job zipfileset dir=${build.classes}/ - zipfileset dir=${conf.dir} excludes=*.template/ + zipfileset dir=${conf.dir} excludes=*.template,hadoop*.*/ zipfileset dir=${lib.dir} prefix=lib includes=**/*.jar excludes=hadoop-*.jar/ zipfileset dir=${build.plugins} prefix=plugins/
svn commit: r637861 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
Author: ab Date: Mon Mar 17 05:42:54 2008 New Revision: 637861 URL: http://svn.apache.org/viewvc?rev=637861view=rev Log: NUTCH-616 Reset Fetch Retry counter when fetch is successful. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637861r1=637860r2=637861view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 17 05:42:54 2008 @@ -232,6 +232,9 @@ 84. NUTCH-615 - Redirected URL-s fetched without setting fetchInterval. Guard against reprUrl being null. (Emmanuel Joke, ab) +85. NUTCH-616 - Reset Fetch Retry counter when fetch is successful (Emmanuel +Joke, ab) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=637861r1=637860r2=637861view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Mon Mar 17 05:42:54 2008 @@ -33,8 +33,8 @@ public abstract class AbstractFetchSchedule extends Configured implements FetchSchedule { private static final Log LOG = LogFactory.getLog(AbstractFetchSchedule.class); - private int defaultInterval; - private int maxInterval; + protected int defaultInterval; + protected int maxInterval; public AbstractFetchSchedule() { super(null); @@ -69,12 +69,22 @@ public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) { datum.setFetchTime(System.currentTimeMillis()); datum.setFetchInterval(defaultInterval); +datum.setRetriesSinceFetch(0); return datum; } - public abstract CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, + /** + * Sets the codefetchInterval/code and codefetchTime/code on a + * successfully fetched page. NOTE: this implementation resets the + * retry counter - extending classes should call super.setFetchSchedule() to + * preserve this behavior. + */ + public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, - long fetchTime, long modifiedTime, int state); + long fetchTime, long modifiedTime, int state) { +datum.setRetriesSinceFetch(0); +return datum; + } /** * This method specifies how to schedule refetching of pages @@ -101,7 +111,8 @@ /** * This method adjusts the fetch schedule if fetching needs to be * re-tried due to transient errors. The default implementation - * sets the next fetch time 1 day in the future. + * sets the next fetch time 1 day in the future and increases + * the retry counter. * @param url URL of the page * @param datum page information * @param prevFetchTime previous fetch time @@ -115,6 +126,7 @@ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime) { datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY); +datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1); return datum; } @@ -122,7 +134,7 @@ * This method return the last fetch time of the CrawlDatum * @return the date as a long. */ - public long calculateLastFetchTime(CrawlDatum datum){ + public long calculateLastFetchTime(CrawlDatum datum) { return datum.getFetchTime() - (long)datum.getFetchInterval() * 1000; } @@ -157,8 +169,8 @@ } /** - * This method resets fetchTime, fetchInterval, modifiedTime and - * page signature, so that it forces refetching. + * This method resets fetchTime, fetchInterval, modifiedTime, + * retriesSinceFetch and page signature, so that it forces refetching. * @param url URL of the page * @param datum datum instance * @param asap if true, force refetch as soon as possible - this sets @@ -170,6 +182,7 @@ if (datum.getFetchInterval() maxInterval) datum.setFetchInterval(maxInterval * 0.9f); datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED
svn commit: r637858 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java src/java/org/apache/nutch/fetcher/Fetcher2.java src/java/org/apache/nutch/parse/ParseOutputForm
Author: ab Date: Mon Mar 17 05:33:56 2008 New Revision: 637858 URL: http://svn.apache.org/viewvc?rev=637858view=rev Log: NUTCH-615 Redirected URL-s fetched without setting fetchInterval. Guard against reprUrl being null. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637858r1=637857r2=637858view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 17 05:33:56 2008 @@ -229,6 +229,9 @@ 83. NUTCH-126 - Fetching https does not work with a proxy (Fritz Elfert via ab) +84. NUTCH-615 - Redirected URL-s fetched without setting fetchInterval. +Guard against reprUrl being null. (Emmanuel Joke, ab) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=637858r1=637857r2=637858view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Mar 17 05:33:56 2008 @@ -282,8 +282,10 @@ return url; } else { CrawlDatum newDatum = new CrawlDatum(); - newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, - new Text(reprUrl)); + if (reprUrl != null) { +newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, +new Text(reprUrl)); + } output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED); if (LOG.isDebugEnabled()) { LOG.debug( - + redirType + redirect to + Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=637858r1=637857r2=637858view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Mon Mar 17 05:33:56 2008 @@ -549,9 +549,12 @@ refreshTime Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR); if (redirUrl != null) { -CrawlDatum newDatum = new CrawlDatum(); -newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, -new Text(reprUrl)); +CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, +fit.datum.getFetchInterval(), fit.datum.getScore()); +if (reprUrl != null) { + newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, + new Text(reprUrl)); +} fit = FetchItem.create(redirUrl, newDatum, byIP); if (fit != null) { FetchItemQueue fiq = @@ -582,14 +585,22 @@ handleRedirect(fit.url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR); -CrawlDatum newDatum = new CrawlDatum(); -newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, -new Text(reprUrl)); -fit = FetchItem.create(redirUrl, newDatum, byIP); -if (fit != null) { - FetchItemQueue fiq = -fetchQueues.getFetchItemQueue(fit.queueID); - fiq.addInProgressFetchItem(fit); +if (redirUrl != null) { + CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, + fit.datum.getFetchInterval(), fit.datum.getScore()); + if (reprUrl != null) { +newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, +new Text(reprUrl)); + } + fit = FetchItem.create(redirUrl, newDatum, byIP); + if (fit != null) { +FetchItemQueue fiq = + fetchQueues.getFetchItemQueue(fit.queueID); +fiq.addInProgressFetchItem(fit); + } else { +// stop redirecting +redirecting = false; + } } else
svn commit: r637960 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/parse-pdf/lib/FontBox-0.1.0-dev.jar src/plugin/parse-pdf/lib/PDFBox-0.7.2-log4j.jar src/plugin/parse-pdf/lib/PDFBox-0.7.3.jar src/p
Author: ab Date: Mon Mar 17 09:23:56 2008 New Revision: 637960 URL: http://svn.apache.org/viewvc?rev=637960view=rev Log: NUTCH-220 Upgrade to PDFBox 0.7.3. Added: lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-0.1.0-dev.jar (with props) lucene/nutch/trunk/src/plugin/parse-pdf/lib/PDFBox-0.7.3.jar (with props) Removed: lucene/nutch/trunk/src/plugin/parse-pdf/lib/PDFBox-0.7.2-log4j.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637960r1=637959r2=637960view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 17 09:23:56 2008 @@ -235,6 +235,8 @@ 85. NUTCH-616 - Reset Fetch Retry counter when fetch is successful (Emmanuel Joke, ab) +86. NUTCH-220 - Upgrade to PDFBox 0.7.3 (ab) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Added: lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-0.1.0-dev.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-0.1.0-dev.jar?rev=637960view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-0.1.0-dev.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-pdf/lib/PDFBox-0.7.3.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/lib/PDFBox-0.7.3.jar?rev=637960view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-pdf/lib/PDFBox-0.7.3.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml?rev=637960r1=637959r2=637960view=diff == --- lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/plugin.xml Mon Mar 17 09:23:56 2008 @@ -26,7 +26,8 @@ library name=parse-pdf.jar export name=*/ /library - library name=PDFBox-0.7.2-log4j.jar/ + library name=PDFBox-0.7.3.jar/ + library name=FontBox-0.1.0-dev.jar/ /runtime requires
svn commit: r638092 - /lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-LICENSE.txt
Author: ab Date: Mon Mar 17 15:08:23 2008 New Revision: 638092 URL: http://svn.apache.org/viewvc?rev=638092view=rev Log: Add missing license file. Added: lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-LICENSE.txt (with props) Added: lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-LICENSE.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-LICENSE.txt?rev=638092view=auto == --- lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-LICENSE.txt (added) +++ lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-LICENSE.txt Mon Mar 17 15:08:23 2008 @@ -0,0 +1,25 @@ +Copyright (c) 2003-2005, www.fontbox.org +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of fontbox; nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file Propchange: lucene/nutch/trunk/src/plugin/parse-pdf/lib/FontBox-LICENSE.txt -- svn:eol-style = native
svn commit: r637105 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/searcher/FetchedSegments.java
Author: ab Date: Fri Mar 14 07:12:31 2008 New Revision: 637105 URL: http://svn.apache.org/viewvc?rev=637105view=rev Log: NUTCH-613 Empty summaries and cached pages. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637105r1=637104r2=637105view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Mar 14 07:12:31 2008 @@ -218,6 +218,9 @@ 78. NUTCH-567 - Proper (?) handling of URIs in TagSoup. TagSoup library is updated to 1.2 version. (dogacan) +79. NUTCH-613 - Empty summaries and cached pages (kubes via ab) + + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=637105r1=637104r2=637105view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java Fri Mar 14 07:12:31 2008 @@ -22,6 +22,7 @@ import java.util.HashMap; import java.util.Iterator; +import org.apache.commons.lang.StringUtils; import org.apache.hadoop.io.*; import org.apache.hadoop.fs.*; import org.apache.nutch.protocol.*; @@ -218,7 +219,11 @@ } private Text getUrl(HitDetails details) { -return new Text(details.getValue(url)); +String url = details.getValue(orig); +if (StringUtils.isBlank(url)) { + url = details.getValue(url); +} +return new Text(url); } public void close() throws IOException {
svn commit: r637114 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Crawl.java src/java/org/apache/nutch/crawl/Generator.java
Author: ab Date: Fri Mar 14 07:33:53 2008 New Revision: 637114 URL: http://svn.apache.org/viewvc?rev=637114view=rev Log: NUTCH-612 URL filtering was disabled when invoking Generator from Crawl. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637114r1=637113r2=637114view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Mar 14 07:33:53 2008 @@ -220,6 +220,9 @@ 79. NUTCH-613 - Empty summaries and cached pages (kubes via ab) +80. NUTCH-612 - URL filtering was disabled in Generator when invoked +from Crawl (Susam Pal via ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=637114r1=637113r2=637114view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Mar 14 07:33:53 2008 @@ -117,7 +117,7 @@ int i; for (i = 0; i depth; i++) { // generate new segment Path segment = generator.generate(crawlDb, segments, -1, topN, System - .currentTimeMillis(), false, false); + .currentTimeMillis()); if (segment == null) { LOG.info(Stopping at depth= + i + - no more URLs to fetch.); break; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=637114r1=637113r2=637114view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Mar 14 07:33:53 2008 @@ -371,11 +371,28 @@ setConf(conf); } - /** Generate fetchlists in a segment. */ - public Path generate(Path dbDir, Path segments) -throws IOException { -return generate(dbDir, segments, -1, Long.MAX_VALUE, System -.currentTimeMillis(), true, false); + /** + * Generate fetchlists in a segment. Whether to filter URLs or not is + * read from the crawl.generate.filter property in the configuration + * files. If the property is not found, the URLs are filtered. + * + * @param dbDir Crawl database directory + * @param segments Segments directory + * @param numLists Number of reduce tasks + * @param topN Number of top URLs to be selected + * @param curTime Current time in milliseconds + * + * @return Path to generated segment or null if no entries were + * selected + * + * @throws IOException When an I/O error occurs + */ + public Path generate(Path dbDir, Path segments, int numLists, + long topN, long curTime) throws IOException { + +JobConf job = new NutchJob(getConf()); +boolean filter = job.getBoolean(CRAWL_GENERATE_FILTER, true); +return generate(dbDir, segments, numLists, topN, curTime, filter, false); } /**
svn commit: r637122 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Crawl.java
Author: ab Date: Fri Mar 14 07:54:31 2008 New Revision: 637122 URL: http://svn.apache.org/viewvc?rev=637122view=rev Log: NUTCH-601 Recrawling in existing crawl directory. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637122r1=637121r2=637122view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Mar 14 07:54:31 2008 @@ -223,6 +223,8 @@ 80. NUTCH-612 - URL filtering was disabled in Generator when invoked from Crawl (Susam Pal via ab) +81. NUTCH-601 - Recrawling on existing crawl directory (Susam Pal via ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=637122r1=637121r2=637122view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Mar 14 07:54:31 2008 @@ -82,9 +82,6 @@ } FileSystem fs = FileSystem.get(job); -if (fs.exists(dir)) { - throw new RuntimeException(dir + already exists.); -} if (LOG.isInfoEnabled()) { LOG.info(crawl started in: + dir); @@ -130,6 +127,18 @@ } if (i 0) { linkDbTool.invert(linkDb, segments, true, true, false); // invert links + + // Delete old indexes + if (fs.exists(indexes)) { +LOG.info(Deleting old indexes: + indexes); +fs.delete(indexes); + } + + // Delete old index + if (fs.exists(index)) { +LOG.info(Deleting old merged index: + index); +fs.delete(index); + } // index, dedup merge indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments, HadoopFSUtil.getPassAllFilter()));
svn commit: r637308 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
Author: ab Date: Fri Mar 14 17:17:07 2008 New Revision: 637308 URL: http://svn.apache.org/viewvc?rev=637308view=rev Log: NUTCH-126 Fetching via https doesn't work with a proxy. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637308r1=637307r2=637308view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Mar 14 17:17:07 2008 @@ -227,6 +227,8 @@ 82. NUTCH-575 - NPE in OpenSearchServlet (John H. Lee via ab) +83. NUTCH-126 - Fetching https does not work with a proxy (Fritz Elfert via ab) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java?rev=637308r1=637307r2=637308view=diff == --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java Fri Mar 14 17:17:07 2008 @@ -34,14 +34,14 @@ import org.apache.commons.httpclient.HttpClientError; import org.apache.commons.httpclient.params.HttpConnectionParams; import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory; -import org.apache.commons.httpclient.protocol.ProtocolSocketFactory; +import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; -public class DummySSLProtocolSocketFactory implements ProtocolSocketFactory { +public class DummySSLProtocolSocketFactory implements SecureProtocolSocketFactory { /** Log object for this class. */ private static final Log LOG = LogFactory.getLog(DummySSLProtocolSocketFactory.class);
svn commit: r604956 - in /lucene/nutch/trunk: CHANGES.txt bin/nutch
Author: ab Date: Mon Dec 17 10:22:17 2007 New Revision: 604956 URL: http://svn.apache.org/viewvc?rev=604956view=rev Log: NUTCH-586 - Add option to run compiled classes without job file. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/bin/nutch Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=604956r1=604955r2=604956view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Dec 17 10:22:17 2007 @@ -176,6 +176,9 @@ 60. NUTCH-581 - DistributedSearch does not update search servers added to search-servers.txt on the fly. (Rohan Mehta via kubes) +61. NUTCH-586 - Add option to run compiled classes without job file +(enis via ab) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?rev=604956r1=604955r2=604956view=diff == --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Mon Dec 17 10:22:17 2007 @@ -30,7 +30,7 @@ # if no args specified, show usage if [ $# = 0 ]; then - echo Usage: nutch COMMAND + echo Usage: nutch [-core] COMMAND echo where COMMAND is one of: echo crawl one-step crawler for intranets echo readdbread / dump crawl db @@ -56,9 +56,20 @@ echo or echo CLASSNAME run the class named CLASSNAME echo Most commands print help when invoked w/o parameters. + echo + echo Expert: -core option is for developers only. It avoids building the job jar, + echo instead it simply includes classes compiled with ant compile-core. + echo NOTE: this works only for jobs executed in 'local' mode exit 1 fi +IS_CORE=0 +#check for -core option +if [ $1 == -core ] ; then + IS_CORE=1 + shift +fi + # get arguments COMMAND=$1 shift @@ -99,17 +110,23 @@ if [ -d $NUTCH_HOME/build/plugins ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build fi -for f in $NUTCH_HOME/build/nutch-*.job; do - CLASSPATH=${CLASSPATH}:$f; -done if [ -d $NUTCH_HOME/build/test/classes ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes fi -# for releases, add Nutch job to CLASSPATH -for f in $NUTCH_HOME/nutch-*.job; do - CLASSPATH=${CLASSPATH}:$f; -done +if [ $IS_CORE == 0 ] +then + for f in $NUTCH_HOME/build/nutch-*.job; do +CLASSPATH=${CLASSPATH}:$f; + done + + # for releases, add Nutch job to CLASSPATH + for f in $NUTCH_HOME/nutch-*.job; do +CLASSPATH=${CLASSPATH}:$f; + done +else + CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/classes +fi # add plugins to classpath if [ -d $NUTCH_HOME/plugins ]; then
svn commit: r577018 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Generator.java
Author: ab Date: Tue Sep 18 12:07:39 2007 New Revision: 577018 URL: http://svn.apache.org/viewvc?rev=577018view=rev Log: NUTCH-554 - Generator throws IOException on invalid urls. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=577018r1=577017r2=577018view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Sep 18 12:07:39 2007 @@ -133,6 +133,9 @@ 45. NUTCH-546 - file URL are filtered out by the crawler. (dogacan) +46. NUTCH-554 - Generator throws IOException on invalid urls. +(Brian Whitman via ab) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=577018r1=577017r2=577018view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Tue Sep 18 12:07:39 2007 @@ -184,7 +184,13 @@ Text url = entry.url; if (maxPerHost 0) { // are we counting hosts? - URL u = new URL(url.toString()); + URL u = null; + try { +u = new URL(url.toString()); + } catch (MalformedURLException e) { +LOG.info(Bad protocol in url: + url.toString()); +continue; + } String host = u.getHost(); if (host == null) { // unknown host, skip
svn commit: r575360 - /lucene/nutch/trunk/conf/nutch-default.xml
Author: ab Date: Thu Sep 13 09:23:52 2007 New Revision: 575360 URL: http://svn.apache.org/viewvc?rev=575360view=rev Log: Document a property. Spotted by Emmanuel Joke. Modified: lucene/nutch/trunk/conf/nutch-default.xml Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=575360r1=575359r2=575360view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Sep 13 09:23:52 2007 @@ -487,6 +487,15 @@ /property property + namefetcher.server.min.delay/name + value0.0/value + descriptionThe minimum number of seconds the fetcher will delay between + successive requests to the same server. This value is applicable ONLY + if fetcher.threads.per.host is greater than 1 (i.e. the host blocking + is turned off)./description +/property + +property namefetcher.max.crawl.delay/name value30/value description
svn commit: r549638 - in /lucene/nutch/trunk: ./ lib/ lib/native/Linux-i386-32/ src/java/org/apache/nutch/indexer/ src/plugin/lib-lucene-analyzers/lib/ src/plugin/summary-lucene/lib/
Author: ab Date: Thu Jun 21 15:52:02 2007 New Revision: 549638 URL: http://svn.apache.org/viewvc?view=revrev=549638 Log: Upgrade to Lucene 2.2.0 and Hadoop 0.12.3. Added: lucene/nutch/trunk/lib/hadoop-0.12.3-core.jar (with props) lucene/nutch/trunk/lib/lucene-core-2.2.0.jar (with props) lucene/nutch/trunk/lib/lucene-misc-2.2.0.jar (with props) lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.2.0.jar (with props) lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.2.0.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.12.2-core.jar lucene/nutch/trunk/lib/lucene-core-2.1.0.jar lucene/nutch/trunk/lib/lucene-misc-2.1.0.jar lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.1.0.jar lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.1.0.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=549638r1=549637r2=549638 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Jun 21 15:52:02 2007 @@ -57,6 +57,8 @@ 17. NUTCH-471 - Fix synchronization in NutchBean creation. (Enis Soztutar via dogacan) +18. Upgrade to Lucene 2.2.0 and Hadoop 0.12.3. (ab) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Added: lucene/nutch/trunk/lib/hadoop-0.12.3-core.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.12.3-core.jar?view=autorev=549638 == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.12.3-core.jar -- svn:executable = * Propchange: lucene/nutch/trunk/lib/hadoop-0.12.3-core.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/lucene-core-2.2.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-core-2.2.0.jar?view=autorev=549638 == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-core-2.2.0.jar -- svn:executable = * Propchange: lucene/nutch/trunk/lib/lucene-core-2.2.0.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/lucene-misc-2.2.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-misc-2.2.0.jar?view=autorev=549638 == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-misc-2.2.0.jar -- svn:executable = * Propchange: lucene/nutch/trunk/lib/lucene-misc-2.2.0.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a?view=diffrev=549638r1=549637r2=549638 == Binary files - no diff available. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java?view=diffrev=549638r1=549637r2=549638 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java Thu Jun 21 15:52:02 2007 @@ -219,8 +219,8 @@ out = fs.create(path); } -public void flushBuffer(byte[] b, int size) throws IOException { - out.write(b, 0, size); +public void flushBuffer(byte[] b, int offset, int size) throws IOException { + out.write(b, offset, size); } public void close() throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?view=diffrev=549638r1=549637r2=549638
svn commit: r543264 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/segment/
Author: ab Date: Thu May 31 14:23:45 2007 New Revision: 543264 URL: http://svn.apache.org/viewvc?view=revrev=543264 Log: NUTCH-392 - OutputFormat implementations should pass on Progressable. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=543264r1=543263r2=543264 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu May 31 14:23:45 2007 @@ -26,6 +26,10 @@ 9. NUTCH-61 - Support for adaptive re-fetch interval and detection of unmodified content. (ab) + +10. NUTCH-392 - OutputFormat implementations should pass on Progressable. +(cutting via ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?view=diffrev=543264r1=543263r2=543264 == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java Thu May 31 14:23:45 2007 @@ -28,6 +28,7 @@ import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.hadoop.mapred.OutputFormat; import org.apache.hadoop.mapred.RecordWriter; @@ -58,7 +59,8 @@ new Path(new Path(job.getOutputPath(), Content.DIR_NAME), name); final MapFile.Writer fetchOut = - new MapFile.Writer(job, fs, fetch.toString(), Text.class, CrawlDatum.class); + new MapFile.Writer(job, fs, fetch.toString(), Text.class, CrawlDatum.class, + CompressionType.NONE, progress); return new RecordWriter() { private MapFile.Writer contentOut; @@ -67,11 +69,12 @@ { if (Fetcher.isStoringContent(job)) { contentOut = new MapFile.Writer(job, fs, content.toString(), -Text.class, Content.class); +Text.class, Content.class, +CompressionType.NONE, progress); } if (Fetcher.isParsing(job)) { -parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name, null); +parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name, progress); } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diffrev=543264r1=543263r2=543264 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Thu May 31 14:23:45 2007 @@ -60,7 +60,7 @@ public static class OutputFormat extends org.apache.hadoop.mapred.OutputFormatBase { public RecordWriter getRecordWriter(final FileSystem fs, JobConf job, -String name, Progressable progress) throws IOException { +String name, final Progressable progress) throws IOException { final Path perm = new Path(job.getOutputPath(), name); final Path temp = job.getLocalPath(index/_+Integer.toString(new Random().nextInt())); @@ -95,6 +95,7 @@ ( + doc.get(lang) + )); } writer.addDocument(doc, analyzer); +progress.progress(); } public void close(final Reporter reporter) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?view=diffrev=543264r1=543263r2=543264 == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Thu May 31 14:23:45 2007 @@ -68,13 +68,16 @@ new Path(new Path(job.getOutputPath(), CrawlDatum.PARSE_DIR_NAME), name
svn commit: r536623 - /lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java
Author: ab Date: Wed May 9 12:15:45 2007 New Revision: 536623 URL: http://svn.apache.org/viewvc?view=revrev=536623 Log: Add missing javadoc and license header. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java?view=diffrev=536623r1=536622r2=536623 == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java Wed May 9 12:15:45 2007 @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.nutch.parse; import java.util.HashMap; @@ -12,6 +29,11 @@ /** * A utility class that stores result of a parse. Internally * a ParseResult stores lt;[EMAIL PROTECTED] Text}, [EMAIL PROTECTED] Parse}gt; pairs. + * pParsers may return multiple results, which correspond to parts + * or other associated documents related to the original URL./p + * pThere will be usually one parse result that corresponds directly + * to the original URL, and possibly many (or none) results that correspond + * to derived URLs (or sub-URLs). */ public class ParseResult implements IterableMap.EntryText, Parse { private MapText, Parse parseMap; @@ -19,45 +41,94 @@ public static final Log LOG = LogFactory.getLog(ParseResult.class); + /** + * Create a container for parse results. + * @param originalUrl the original url from which all parse results + * have been obtained. + */ public ParseResult(String originalUrl) { parseMap = new HashMapText, Parse(); this.originalUrl = originalUrl; } + /** + * Convenience method for obtaining [EMAIL PROTECTED] ParseResult} from a single + * [EMAIL PROTECTED] Parse} output. + * @param url canonical url + * @param parse single parse output + * @return result containing the single parse output + */ public static ParseResult createParseResult(String url, Parse parse) { ParseResult parseResult = new ParseResult(url); parseResult.put(new Text(url), new ParseText(parse.getText()), parse.getData()); return parseResult; } + /** + * Checks whether the result is empty. + * @return + */ public boolean isEmpty() { return parseMap.isEmpty(); } + /** + * Return the number of parse outputs (both successful and failed) + */ public int size() { return parseMap.size(); } + /** + * Retrieve a single parse output. + * @param key sub-url under which the parse output is stored. + * @return parse output corresponding to this sub-url, or null. + */ public Parse get(String key) { return get(new Text(key)); } + /** + * Retrieve a single parse output. + * @param key sub-url under which the parse output is stored. + * @return parse output corresponding to this sub-url, or null. + */ public Parse get(Text key) { return parseMap.get(key); } + /** + * Store a result of parsing. + * @param key URL or sub-url of this parse result + * @param text plain text result + * @param data corresponding parse metadata of this result + */ public void put(Text key, ParseText text, ParseData data) { put(key.toString(), text, data); } + /** + * Store a result of parsing. + * @param key URL or sub-url of this parse result + * @param text plain text result + * @param data corresponding parse metadata of this result + */ public void put(String key, ParseText text, ParseData data) { parseMap.put(new Text(key), new ParseImpl(text, data, key.equals(originalUrl))); } + /** + * Iterate over all entries in the lt;url, Parsegt; map. + */ public IteratorEntryText, Parse iterator() { return parseMap.entrySet().iterator(); } + /** + * Remove all results where status is not successful (as determined + * by [EMAIL PROTECTED] ParseStatus#isSuccess()}). Note that effects of this operation + * cannot be reversed. + */ public void filter
svn commit: r536629 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/Indexer.java src/java/org/apache/nutch/indexer/IndexingFilter.java src/java/org/apache/nutch/indexer/Indexin
Author: ab Date: Wed May 9 12:36:54 2007 New Revision: 536629 URL: http://svn.apache.org/viewvc?view=revrev=536629 Log: NUTCH-393 - Indexer should handle null documents returned by filters. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=536629r1=536628r2=536629 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed May 9 12:36:54 2007 @@ -7,6 +7,9 @@ 2. NUTCH-443 - Allow parsers to return multiple Parse objects. (Dogacan Guney et al, via ab) + 3. NUTCH-393 - Indexer should handle null documents returned by filters. +(Eelco Lempsink via ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diffrev=536629r1=536628r2=536629 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Wed May 9 12:36:54 2007 @@ -218,6 +218,9 @@ return; } +// skip documents discarded by indexing filters +if (doc == null) return; + float boost = 1.0f; // run scoring filters try { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?view=diffrev=536629r1=536628r2=536629 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Wed May 9 12:36:54 2007 @@ -41,14 +41,15 @@ /** * Adds fields or otherwise modifies the document that will be indexed for a - * parse. + * parse. Unwanted documents can be removed from indexing by returning a null value. * * @param doc document instance for collecting fields * @param parse parse data instance * @param url page url * @param datum crawl datum for the page * @param inlinks page inlinks - * @return modified (or a new) document instance + * @return modified (or a new) document instance, or null (meaning the document + * should be discarded) * @throws IndexingException */ Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?view=diffrev=536629r1=536628r2=536629 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Wed May 9 12:36:54 2007 @@ -108,6 +108,8 @@ Inlinks inlinks) throws IndexingException { for (int i = 0; i this.indexingFilters.length; i++) { doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks); + // break the loop if an indexing filter discards the doc + if (doc == null) return null; } return doc;
svn commit: r532088 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher2.java
Author: ab Date: Tue Apr 24 14:32:51 2007 New Revision: 532088 URL: http://svn.apache.org/viewvc?view=revrev=532088 Log: NUTCH-474 - Fix crawlDelay and blocking checks. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=532088r1=532087r2=532088 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Apr 24 14:32:51 2007 @@ -2,6 +2,9 @@ Unreleased changes (1.0-dev) + 1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab) + + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diffrev=532088r1=532087r2=532088 == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Tue Apr 24 14:32:51 2007 @@ -236,7 +236,7 @@ public FetchItem getFetchItem() { if (inProgress.size() = maxThreads) return null; long now = System.currentTimeMillis(); - long last = endTime.get() + (maxThreads 1 ? crawlDelay : minCrawlDelay); + long last = endTime.get() + (maxThreads 1 ? minCrawlDelay : crawlDelay); if (last now) return null; FetchItem it = null; if (queue.size() == 0) return null; @@ -771,8 +771,8 @@ feeder.start(); // set non-blocking no-robots mode for HTTP protocol plugins. -getConf().setBoolean(http.plugin.check.blocking, false); -getConf().setBoolean(http.plugin.check.robots, false); +getConf().setBoolean(Protocol.CHECK_BLOCKING, false); +getConf().setBoolean(Protocol.CHECK_ROBOTS, false); for (int i = 0; i threadCount; i++) { // spawn threads new FetcherThread(getConf()).start();
svn commit: r532105 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
Author: ab Date: Tue Apr 24 15:13:53 2007 New Revision: 532105 URL: http://svn.apache.org/viewvc?view=revrev=532105 Log: Prevent NPE when working with small, possibly empty indexes. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?view=diffrev=532105r1=532104r2=532105 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Tue Apr 24 15:13:53 2007 @@ -158,19 +158,28 @@ public class DDRecordReader implements RecordReader { private IndexReader indexReader; - private int maxDoc; - private int doc; + private int maxDoc = 0; + private int doc = 0; private Text index; public DDRecordReader(FileSplit split, JobConf job, Text index) throws IOException { -indexReader = IndexReader.open(new FsDirectory(FileSystem.get(job), split.getPath(), false, job)); -maxDoc = indexReader.maxDoc(); +try { + indexReader = IndexReader.open(new FsDirectory(FileSystem.get(job), split.getPath(), false, job)); + maxDoc = indexReader.maxDoc(); +} catch (IOException ioe) { + LOG.warn(Can't open index at + split + , skipping. ( + ioe.getMessage() + )); + indexReader = null; +} this.index = index; } public boolean next(Writable key, Writable value) throws IOException { + +// skip empty indexes +if (indexReader == null || maxDoc = 0) + return false; // skip deleted documents while (indexReader.isDeleted(doc) doc maxDoc) doc++;
svn commit: r526455 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
Author: ab Date: Sat Apr 7 09:44:02 2007 New Revision: 526455 URL: http://svn.apache.org/viewvc?view=revrev=526455 Log: Empty MapWritable would throw an NPE when building a keySet. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java?view=diffrev=526455r1=526454r2=526455 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java Sat Apr 7 09:44:02 2007 @@ -175,6 +175,7 @@ public Set keySet() { HashSet set = new HashSet(); +if (isEmpty()) return set; set.add(fFirst.fKey); KeyValueEntry entry = fFirst; while ((entry = entry.fNextEntry) != null) {
svn commit: r521933 - in /lucene/nutch/trunk: ./ lib/ lib/native/Linux-i386-32/ src/test/org/apache/nutch/indexer/
Author: ab Date: Fri Mar 23 15:59:01 2007 New Revision: 521933 URL: http://svn.apache.org/viewvc?view=revrev=521933 Log: Upgrade to Hadoop 0.12.2 release. Fix whitespace issues in platform name in bin/hadoop under Cygwin. Replace deprecated method call. Added: lucene/nutch/trunk/lib/hadoop-0.12.2-core.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.12.1-core.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/build.xml lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1 lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0 lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=521933r1=521932r2=521933 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Mar 23 15:59:01 2007 @@ -169,6 +169,8 @@ 57. NUTCH-246 - Incorrect segment size being generated due to time synchronization issue (Stefan Groschupf via ab) +58. Upgrade to Hadoop 0.12.2 release. (ab) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?view=diffrev=521933r1=521932r2=521933 == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Fri Mar 23 15:59:01 2007 @@ -75,6 +75,8 @@ /unjar untar src=${build.dir}/hadoop/bin.tgz dest=bin compression=gzip/ +!-- fix broken library paths with spaces -- +replace file=bin/hadoop token=PlatformName value=PlatformName | sed -e 's/ /_/g'/ chmod dir=bin perm=ugo+rx includes=*.sh,hadoop/ !-- unpack hadoop webapp from hadoop jar into build directory -- Added: lucene/nutch/trunk/lib/hadoop-0.12.2-core.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.12.2-core.jar?view=autorev=521933 == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.12.2-core.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a?view=diffrev=521933r1=521932r2=521933 == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so?view=diffrev=521933r1=521932r2=521933 == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1 URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1?view=diffrev=521933r1=521932r2=521933 == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0 URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0?view=diffrev=521933r1=521932r2=521933 == Binary files - no diff available. Modified: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?view=diffrev=521933r1=521932r2=521933 == --- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Fri Mar 23 15:59:01 2007 @@ -57,7 +57,7 @@ private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst) throws Exception { Path idx = new Path(root, name); Path sub = new Path(idx, part-); -Directory dir = FSDirectory.getDirectory(sub.toString(), true); +Directory dir = FSDirectory.getDirectory(sub.toString()); IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true); Document doc = makeDoc(name, MD5Hash.digest(1).toString(),
svn commit: r521182 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Injector.java
Author: ab Date: Thu Mar 22 03:08:00 2007 New Revision: 521182 URL: http://svn.apache.org/viewvc?view=revrev=521182 Log: NUTCH-246 - incorrect segment size being generated due to time synchronization issue. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=521182r1=521181r2=521182 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Mar 22 03:08:00 2007 @@ -166,6 +166,9 @@ 56. Upgrade to Hadoop 0.12.1 release. (ab) +57. NUTCH-246 - Incorrect segment size being generated due to time +synchronization issue (Stefan Groschupf via ab) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?view=diffrev=521182r1=521181r2=521182 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Mar 22 03:08:00 2007 @@ -51,6 +51,7 @@ private JobConf jobConf; private URLFilters filters; private ScoringFilters scfilters; +private long curTime; public void configure(JobConf job) { this.jobConf = job; @@ -59,6 +60,7 @@ filters = new URLFilters(jobConf); scfilters = new ScoringFilters(jobConf); scoreInjected = jobConf.getFloat(db.score.injected, 1.0f); + curTime = job.getLong(injector.current.time, System.currentTimeMillis()); } public void close() {} @@ -79,6 +81,7 @@ if (url != null) { // if it passes value.set(url); // collect it CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, interval); +datum.setFetchTime(curTime); datum.setScore(scoreInjected); try { scfilters.injectedScore(value, datum); @@ -96,7 +99,7 @@ /** Combine multiple new entries for a url. */ public static class InjectReducer implements Reducer { -public void configure(JobConf job) {} +public void configure(JobConf job) {} public void close() {} public void reduce(WritableComparable key, Iterator values, @@ -155,6 +158,7 @@ sortJob.setOutputFormat(SequenceFileOutputFormat.class); sortJob.setOutputKeyClass(Text.class); sortJob.setOutputValueClass(CrawlDatum.class); +sortJob.setLong(injector.current.time, System.currentTimeMillis()); JobClient.runJob(sortJob); // merge with existing crawl db
svn commit: r520154 - in /lucene/nutch/trunk: ./ lib/ lib/native/Linux-i386-32/
Author: ab Date: Mon Mar 19 16:02:56 2007 New Revision: 520154 URL: http://svn.apache.org/viewvc?view=revrev=520154 Log: Update to Hadoop 0.12.1. Added: lucene/nutch/trunk/lib/hadoop-0.12.1-core.jar (with props) lucene/nutch/trunk/lib/jets3t-0.5.0.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.11.2-core.jar lucene/nutch/trunk/lib/jets3t.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1 lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0 Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=520154r1=520153r2=520154 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 19 16:02:56 2007 @@ -163,6 +163,9 @@ 55. NUTCH-436 - Incorrect handling of relative paths when the embedded URL path is empty (kubes) + +56. Upgrade to Hadoop 0.12.1 release. (ab) + Release 0.8 - 2006-07-25 Added: lucene/nutch/trunk/lib/hadoop-0.12.1-core.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.12.1-core.jar?view=autorev=520154 == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.12.1-core.jar -- svn:executable = * Propchange: lucene/nutch/trunk/lib/hadoop-0.12.1-core.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/jets3t-0.5.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/jets3t-0.5.0.jar?view=autorev=520154 == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/jets3t-0.5.0.jar -- svn:executable = * Propchange: lucene/nutch/trunk/lib/jets3t-0.5.0.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a?view=diffrev=520154r1=520153r2=520154 == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so?view=diffrev=520154r1=520153r2=520154 == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1 URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1?view=diffrev=520154r1=520153r2=520154 == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0 URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0?view=diffrev=520154r1=520153r2=520154 == Binary files - no diff available.
svn commit: r517382 - in /lucene/nutch/trunk/contrib/web2/plugins: web-caching-oscache/ web-caching-oscache/src/conf/ web-clustering/ web-keymatch/ web-more/ web-more/src/conf/ web-query-propose-ontol
Author: ab Date: Mon Mar 12 13:35:37 2007 New Revision: 517382 URL: http://svn.apache.org/viewvc?view=revrev=517382 Log: Fix inconsistent end-of-line style. Discovered this when trying to import to a separate subversion repo. Modified: lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/build.xml lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/plugin.xml lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/src/conf/tiles-defs.xml lucene/nutch/trunk/contrib/web2/plugins/web-clustering/build.xml lucene/nutch/trunk/contrib/web2/plugins/web-clustering/plugin.xml lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/build.xml lucene/nutch/trunk/contrib/web2/plugins/web-keymatch/plugin.xml lucene/nutch/trunk/contrib/web2/plugins/web-more/build.xml lucene/nutch/trunk/contrib/web2/plugins/web-more/plugin.xml lucene/nutch/trunk/contrib/web2/plugins/web-more/src/conf/tiles-defs.xml lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/build.xml lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-ontology/plugin.xml lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/conf/tiles-defs.xml lucene/nutch/trunk/contrib/web2/plugins/web-resources/build.xml lucene/nutch/trunk/contrib/web2/plugins/web-resources/plugin.xml lucene/nutch/trunk/contrib/web2/plugins/web-subcollection/build.xml lucene/nutch/trunk/contrib/web2/plugins/web-subcollection/plugin.xml lucene/nutch/trunk/contrib/web2/plugins/web-subcollection/src/conf/tiles-defs.xml Modified: lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/build.xml?view=diffrev=517382r1=517381r2=517382 == --- lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/build.xml (original) +++ lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/build.xml Mon Mar 12 13:35:37 2007 @@ -1,4 +1,4 @@ -?xml version=1.0? +?xml version=1.0? !-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with @@ -14,21 +14,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --- -project name=web-caching-oscache default=jar-core - - import file=../build-plugin.xml / - property name=nutch.root location=${root}/../../../../ / - - target name=init-plugin -echoCopying UI configuration/echo -copy todir=${build.classes} - fileset dir=src/conf includes=**/* / -/copy -echoCopying UI templates/echo -copy todir=${deploy.dir}/web - fileset dir=src/web includes=**/* / -/copy - /target - -/project +-- +project name=web-caching-oscache default=jar-core + + import file=../build-plugin.xml / + property name=nutch.root location=${root}/../../../../ / + + target name=init-plugin +echoCopying UI configuration/echo +copy todir=${build.classes} + fileset dir=src/conf includes=**/* / +/copy +echoCopying UI templates/echo +copy todir=${deploy.dir}/web + fileset dir=src/web includes=**/* / +/copy + /target + +/project Modified: lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/plugin.xml?view=diffrev=517382r1=517381r2=517382 == --- lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/plugin.xml (original) +++ lucene/nutch/trunk/contrib/web2/plugins/web-caching-oscache/plugin.xml Mon Mar 12 13:35:37 2007 @@ -1,4 +1,4 @@ -?xml version=1.0 encoding=UTF-8? +?xml version=1.0 encoding=UTF-8? !-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with @@ -16,25 +16,25 @@ limitations under the License. -- plugin id=web-caching-oscache name=Search result caching - version=1.0.0 provider-name=apache.org - - runtime -library name=web-caching-oscache.jar - export name=* / -/library - -library name=oscache-2.1.jar / - /runtime - - requires + version=1.0.0 provider-name=apache.org + + runtime +library name=web-caching-oscache.jar + export name=* / +/library + +library name=oscache-2.1.jar / + /runtime + + requires import plugin=webui-extensionpoints / - /requires - + /requires + extension id=org.apache.nutch.webapp.extension.UIExtensionPoint name=Nutch ui extension point -point=org.apache.nutch.webapp.extension.UIExtensionPoint +point=org.apache.nutch.webapp.extension.UIExtensionPoint implementation id=web-caching-oscache - class
svn commit: r516387 - /lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
Author: ab Date: Fri Mar 9 04:27:18 2007 New Revision: 516387 URL: http://svn.apache.org/viewvc?view=revrev=516387 Log: Add the number of active threads to the status report. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diffrev=516387r1=516386r2=516387 == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Fri Mar 9 04:27:18 2007 @@ -727,7 +727,7 @@ private void reportStatus() throws IOException { String status; long elapsed = (System.currentTimeMillis() - start)/1000; -status = +status = activeThreads + threads, + pages+ pages, +errors+ errors, + Math.round(((float)pages.get()*10)/elapsed)/10.0+ pages/s, + Math.round(float)bytes.get())*8)/1024)/elapsed)+ kb/s, ;
svn commit: r515698 - in /lucene/nutch/trunk: CHANGES.txt bin/nutch
Author: ab Date: Wed Mar 7 11:02:56 2007 New Revision: 515698 URL: http://svn.apache.org/viewvc?view=revrev=515698 Log: NUTCH-432 - JAVA_PLATFORM with spaces breaks bin/nutch. Also, apply the patch proposed in HADOOP-1080 to fix CLASSPATH problems under Cygwin. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/bin/nutch Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=515698r1=515697r2=515698 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Mar 7 11:02:56 2007 @@ -148,6 +148,9 @@ 49. NUTCH-449 - Make junit output format configurable. (nigel via cutting) +50. NUTCH-432 - Fix a bug where platform name with spaces would break the +bin/nutch script. (Brian Whitman via ab) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?view=diffrev=515698r1=515697r2=515698 == --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Wed Mar 7 11:02:56 2007 @@ -125,11 +125,15 @@ CLASSPATH=${CLASSPATH}:$f; done +# cygwin path translation +if $cygwin; then + CLASSPATH=`cygpath -p -w $CLASSPATH` +fi # setup 'java.library.path' for native-hadoop code if necessary JAVA_LIBRARY_PATH='' if [ -d ${NUTCH_HOME}/build/native -o -d ${NUTCH_HOME}/lib/native ]; then - JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName` + JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'` if [ -d $NUTCH_HOME/build/native ]; then JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib @@ -144,6 +148,10 @@ fi fi +if [ $cygwin -a X${JAVA_LIBRARY_PATH} != X ]; then + JAVA_LIBRARY_PATH=`cygpath -p -w $JAVA_LIBRARY_PATH` +fi + # restore ordinary behaviour unset IFS @@ -215,11 +223,6 @@ CLASS='org.apache.nutch.searcher.DistributedSearch$Server' else CLASS=$COMMAND -fi - -# cygwin path translation -if $cygwin; then - CLASSPATH=`cygpath -p -w $CLASSPATH` fi # run it
svn commit: r515791 - in /lucene/nutch/trunk: ./ lib/ lib/native/Linux-i386-32/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apac
Author: ab Date: Wed Mar 7 13:59:07 2007 New Revision: 515791 URL: http://svn.apache.org/viewvc?view=revrev=515791 Log: Upgrade to Hadoop 0.11.2 and Lucene 2.1.0 releases. Added: lucene/nutch/trunk/lib/hadoop-0.11.2-core.jar (with props) lucene/nutch/trunk/lib/lucene-core-2.1.0.jar (with props) lucene/nutch/trunk/lib/lucene-misc-2.1.0.jar (with props) lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.1.0.jar (with props) lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.1.0.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.10.1-core.jar lucene/nutch/trunk/lib/lucene-core-2.0.0.jar lucene/nutch/trunk/lib/lucene-misc-2.0.0.jar lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.0.0.jar lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0.0.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1 lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0 lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=515791r1=515790r2=515791 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Mar 7 13:59:07 2007 @@ -151,6 +151,8 @@ 50. NUTCH-432 - Fix a bug where platform name with spaces would break the bin/nutch script. (Brian Whitman via ab) +51. Upgrade to Hadoop 0.11.2 and Lucene 2.1.0 release. + Release 0.8 - 2006-07-25 Added: lucene/nutch/trunk/lib/hadoop-0.11.2-core.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.11.2-core.jar?view=autorev=515791 == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.11.2-core.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/lucene-core-2.1.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-core-2.1.0.jar?view=autorev=515791 == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-core-2.1.0.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/lucene-misc-2.1.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-misc-2.1.0.jar?view=autorev=515791 == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-misc-2.1.0.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a?view=diffrev=515791r1=515790r2=515791 == Binary files - no diff available. Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so?view=diffrev=515791r1=515790r2=515791
svn commit: r507504 - in /lucene/nutch/trunk/src/java/org/apache/nutch/parse: Outlink.java ParseSegment.java
Author: ab Date: Wed Feb 14 04:15:05 2007 New Revision: 507504 URL: http://svn.apache.org/viewvc?view=revrev=507504 Log: Outlink: when null anchor is supplied replace it with an empty string. ParseSegment: store segment name in parts that we produce here. Content is only read, not stored as one of the outputs. Failure to do that results in NPE in Indexer. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java?view=diffrev=507504r1=507503r2=507504 == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java Wed Feb 14 04:15:05 2007 @@ -34,6 +34,7 @@ public Outlink(String toUrl, String anchor, Configuration conf) throws MalformedURLException { this.toUrl = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK).normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); +if (anchor == null) anchor = ; this.anchor = anchor; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?view=diffrev=507504r1=507503r2=507504 == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed Feb 14 04:15:05 2007 @@ -81,7 +81,10 @@ // compute the new signature byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse); -content.getMetadata().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); +if (parse != null) { + parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); + parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, getConf().get(Nutch.SEGMENT_NAME_KEY)); +} if (status.isSuccess()) { try { @@ -95,7 +98,7 @@ } output.collect(key, new ParseImpl(parse.getText(), parse.getData())); } else if (LOG.isWarnEnabled()) { - LOG.warn(Error parsing: +key+: +status.toString()); + LOG.warn(Error parsing: + key + : +status.toString()); } } @@ -116,9 +119,8 @@ job.setJobName(parse + segment); job.setInputPath(new Path(segment, Content.DIR_NAME)); +job.set(Nutch.SEGMENT_NAME_KEY, segment.getName()); job.setInputFormat(SequenceFileInputFormat.class); -job.setInputKeyClass(Text.class); -job.setInputValueClass(Content.class); job.setMapperClass(ParseSegment.class); job.setReducerClass(ParseSegment.class);
svn commit: r499944 - /lucene/nutch/trunk/CHANGES.txt
Author: ab Date: Thu Jan 25 12:15:34 2007 New Revision: 499944 URL: http://svn.apache.org/viewvc?view=revrev=499944 Log: Mention the addition of Fetcher2. Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=499944r1=499943r2=499944 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Jan 25 12:15:34 2007 @@ -142,6 +142,8 @@ 46. NUTCH-433 - java.io.EOFException in newer nightlies in mergesegs or indexing from hadoop.io.DataOutputBuffer (siren) +47. NUTCH-339 - Fetcher2: a queue-based fetcher implementation. (ab) + Release 0.8 - 2006-07-25
svn commit: r497141 - in /lucene/nutch/trunk: CHANGES.txt bin/nutch src/java/org/apache/nutch/tools/FreeGenerator.java
Author: ab Date: Wed Jan 17 11:55:07 2007 New Revision: 497141 URL: http://svn.apache.org/viewvc?view=revrev=497141 Log: NUTCH-68 - ported to use map-reduce. Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (with props) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/bin/nutch Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=497141r1=497140r2=497141 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Jan 17 11:55:07 2007 @@ -137,6 +137,8 @@ 44. NUTCH-430 - Integer overflow in HashComparator.compare (siren) +45. NUTCH-68 - Add a tool to generate arbitrary fetchlists. (ab) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?view=diffrev=497141r1=497140r2=497141 == --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Wed Jan 17 11:55:07 2007 @@ -38,7 +38,8 @@ echo mergedb merge crawldb-s, with optional filtering echo readlinkdbread / dump link db echo injectinject new urls into the database - echo generate generate new segments to fetch + echo generate generate new segments to fetch from crawl db + echo freegen generate new segments to fetch from text files echo fetch fetch a segment's pages echo parse parse a segment's pages echo readseg read / dump segment data @@ -172,6 +173,8 @@ CLASS=org.apache.nutch.crawl.Injector elif [ $COMMAND = generate ] ; then CLASS=org.apache.nutch.crawl.Generator +elif [ $COMMAND = freegen ] ; then + CLASS=org.apache.nutch.tools.FreeGenerator elif [ $COMMAND = fetch ] ; then CLASS=org.apache.nutch.fetcher.Fetcher elif [ $COMMAND = parse ] ; then Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?view=autorev=497141 == --- lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Wed Jan 17 11:55:07 2007 @@ -0,0 +1,164 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tools; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reducer; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.ToolBase; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Generator; +import org.apache.nutch.net.URLFilters; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; + +/** + * This tool generates fetchlists (segments to be fetched) from plain text + * files containing one URL per line. It's useful when arbitrary URL-s need to + * be fetched without adding them first to the CrawlDb, or during testing. + * + * @author Andrzej Bialecki + */ +public class FreeGenerator extends ToolBase { + private static final Log LOG = LogFactory.getLog(FreeGenerator.class); + + private static final String FILTER_KEY = free.generator.filter
svn commit: r497172 - in /lucene/nutch/trunk: bin/nutch src/java/org/apache/nutch/fetcher/Fetcher.java src/java/org/apache/nutch/fetcher/Fetcher2.java
Author: ab Date: Wed Jan 17 13:06:50 2007 New Revision: 497172 URL: http://svn.apache.org/viewvc?view=revrev=497172 Log: Revert accidental change to bin/nutch. Fix Fetcher.java to correctly split input. Add Fetcher2 - a queue-based fetcher implementation. Added: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (with props) Modified: lucene/nutch/trunk/bin/nutch lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?view=diffrev=497172r1=497171r2=497172 == --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Wed Jan 17 13:06:50 2007 @@ -41,6 +41,7 @@ echo generate generate new segments to fetch from crawl db echo freegen generate new segments to fetch from text files echo fetch fetch a segment's pages + echo fetch2fetch a segment's pages using Fetcher2 implementation echo parse parse a segment's pages echo readseg read / dump segment data echo mergesegs merge several segments, with optional filtering and slicing @@ -177,6 +178,8 @@ CLASS=org.apache.nutch.tools.FreeGenerator elif [ $COMMAND = fetch ] ; then CLASS=org.apache.nutch.fetcher.Fetcher +elif [ $COMMAND = fetch2 ] ; then + CLASS=org.apache.nutch.fetcher.Fetcher2 elif [ $COMMAND = parse ] ; then CLASS=org.apache.nutch.parse.ParseSegment elif [ $COMMAND = readdb ] ; then @@ -220,6 +223,5 @@ fi # run it -echo $JAVA $JAVA_HEAP_MAX $NUTCH_OPTS -classpath $CLASSPATH $CLASS $@ exec $JAVA $JAVA_HEAP_MAX $NUTCH_OPTS -classpath $CLASSPATH $CLASS $@ Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diffrev=497172r1=497171r2=497172 == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Jan 17 13:06:50 2007 @@ -48,9 +48,10 @@ public static class InputFormat extends SequenceFileInputFormat { /** Don't split inputs, to keep things polite. */ -public InputSplit[] getSplits(FileSystem fs, JobConf job, int nSplits) +public InputSplit[] getSplits(JobConf job, int nSplits) throws IOException { Path[] files = listPaths(job); + FileSystem fs = FileSystem.get(job); InputSplit[] splits = new InputSplit[files.length]; for (int i = 0; i files.length; i++) { splits[i] = new FileSplit(files[i], 0, fs.getLength(files[i]), job); Added: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=autorev=497172 == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Wed Jan 17 13:06:50 2007 @@ -0,0 +1,875 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the License); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.fetcher; + +import java.io.IOException; +import java.net.InetAddress; +import java.net.URL; +import java.net.UnknownHostException; +import java.util.*; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +// Commons Logging imports +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.hadoop.io.*; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.conf.*; +import org.apache.hadoop.mapred.*; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.SignatureFactory; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.*; +import org.apache.nutch.protocol.*; +import org.apache.nutch.parse.*; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.*; + + +/** + * A queue-based fetcher. + * + * pThis fetcher uses a well-known model of one producer (a QueueFeeder) + * and many consumers (FetcherThread-s
svn commit: r496535 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Author: ab Date: Mon Jan 15 15:07:15 2007 New Revision: 496535 URL: http://svn.apache.org/viewvc?view=revrev=496535 Log: Pick the right entry, as indicated by the same generate time. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diffrev=496535r1=496534r2=496535 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon Jan 15 15:07:15 2007 @@ -299,6 +299,12 @@ * Update the CrawlDB so that the next generate won't include the same URLs. */ public static class CrawlDbUpdater extends MapReduceBase implements Mapper, Reducer { +long generateTime; + +public void configure(JobConf job) { + generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L); +} + public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { if (key instanceof FloatWritable) { // tempDir source SelectorEntry se = (SelectorEntry)value; @@ -315,6 +321,11 @@ CrawlDatum val = (CrawlDatum)values.next(); if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) { genTime = (LongWritable)val.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY); + if (genTime.get() != generateTime) { +orig = val; +genTime = null; +continue; + } } else { orig = val; } @@ -384,7 +395,8 @@ } job.setLong(CRAWL_GEN_CUR_TIME, curTime); // record real generation time -job.setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis()); +long generateTime = System.currentTimeMillis(); +job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); job.setLong(CRAWL_TOP_N, topN); job.setBoolean(CRAWL_GENERATE_FILTER, filter); @@ -453,6 +465,7 @@ job = new NutchJob(getConf()); job.setJobName(generate: updatedb + dbDir); + job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); job.addInputPath(tempDir); job.addInputPath(new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); @@ -492,7 +505,7 @@ } /** - * Generate a fetchlist from the pagedb and linkdb + * Generate a fetchlist from the crawldb. */ public static void main(String args[]) throws Exception { int res = new Generator().doMain(NutchConfiguration.create(), args);
svn commit: r495214 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Crawl.java src/java/org/apache/nutch/indexer/Indexer.java
Author: ab Date: Thu Jan 11 05:25:43 2007 New Revision: 495214 URL: http://svn.apache.org/viewvc?view=revrev=495214 Log: When indexing redirected pages, drop intermediate pages and only index the final page. Avoid NPEs in Crawl tool, when no URLs are generated or fetched. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=495214r1=495213r2=495214 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Jan 11 05:25:43 2007 @@ -123,6 +123,9 @@ 39. NUTCH-421 - Allow predeterminate running order of indexing filters (Alan Tanaman, siren) +40. When indexing pages with redirection, drop all intermediate pages and +index only the final page. (ab) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diffrev=495214r1=495213r2=495214 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Thu Jan 11 05:25:43 2007 @@ -113,8 +113,8 @@ // initialize crawlDb injector.inject(crawlDb, rootUrlDir); - -for (int i = 0; i depth; i++) { // generate new segment +int i; +for (i = 0; i depth; i++) { // generate new segment Path segment = generator.generate(crawlDb, segments, -1, topN, System .currentTimeMillis(), false, false); if (segment == null) { @@ -127,14 +127,16 @@ } crawlDbTool.update(crawlDb, new Path[]{segment}, true, true); // update crawldb } - -linkDbTool.invert(linkDb, segments, true, true, false); // invert links - -// index, dedup merge -indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments)); -dedup.dedup(new Path[] { indexes }); -merger.merge(fs.listPaths(indexes), index, tmpDir); +if (i 0) { + linkDbTool.invert(linkDb, segments, true, true, false); // invert links + // index, dedup merge + indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments)); + dedup.dedup(new Path[] { indexes }); + merger.merge(fs.listPaths(indexes), index, tmpDir); +} else { + LOG.warn(No URLs to fetch - check your seed list and URL filters.); +} if (LOG.isInfoEnabled()) { LOG.info(crawl finished: + dir); } } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diffrev=495214r1=495213r2=495214 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Thu Jan 11 05:25:43 2007 @@ -182,6 +182,7 @@ Inlinks inlinks = null; CrawlDatum dbDatum = null; CrawlDatum fetchDatum = null; +CrawlDatum redir = null; ParseData parseData = null; ParseText parseText = null; while (values.hasNext()) { @@ -194,6 +195,9 @@ dbDatum = datum; else if (CrawlDatum.hasFetchStatus(datum)) fetchDatum = datum; +else if (CrawlDatum.STATUS_LINKED == datum.getStatus()) + // redirected page + redir = datum; else throw new RuntimeException(Unexpected status: +datum.getStatus()); } else if (value instanceof ParseData) { @@ -204,6 +208,11 @@ LOG.warn(Unrecognized type: +value.getClass()); } } +if (redir != null) { + // XXX page was redirected - what should we do? + // XXX discard it for now + return; +} if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) {
svn commit: r495397 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/DeleteDuplicates.java src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
Author: ab Date: Thu Jan 11 14:00:51 2007 New Revision: 495397 URL: http://svn.apache.org/viewvc?view=revrev=495397 Log: Fix NUTCH-420 - DeleteDuplicates depended on the order of IndexDoc processing.. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=495397r1=495396r2=495397 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Jan 11 14:00:51 2007 @@ -128,6 +128,9 @@ 41. Upgrade to Hadoop 0.10.1. (ab) +42. NUTCH-420 - Fix a bug in DeleteDuplicates where results depended on the +order in which IndexDoc-s are processed. (Dogacan Guney via ab) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?view=diffrev=495397r1=495396r2=495397 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Thu Jan 11 14:00:51 2007 @@ -311,22 +311,25 @@ highest = value; continue; } -if (byScore) { - if (value.score highest.score) { -highest.keep = false; -LOG.debug(-discard + highest + , keep + value); -output.collect(highest.url, highest); // delete highest -highest = value; - } +IndexDoc toDelete = null, toKeep = null; +boolean metric = byScore ? (value.score highest.score) : + (value.urlLen highest.urlLen); +if (metric) { + toDelete = highest; + toKeep = value; } else { - if (value.urlLen highest.urlLen) { -highest.keep = false; -LOG.debug(-discard + highest + , keep + value); -output.collect(highest.url, highest); // delete highest -highest = value; - } + toDelete = value; + toKeep = highest; } - } + +if (LOG.isDebugEnabled()) { + LOG.debug(-discard + toDelete + , keep + toKeep); +} + +toDelete.keep = false; +output.collect(toDelete.url, toDelete); +highest = toKeep; + } LOG.debug(-keep + highest); // no need to add this - in phase 2 we only process docs to delete them // highest.keep = true; Modified: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?view=diffrev=495397r1=495396r2=495397 == --- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Thu Jan 11 14:00:51 2007 @@ -41,6 +41,7 @@ Path root; Path index1; Path index2; + Path index3; public void setUp() throws Exception { conf = NutchConfiguration.create(); @@ -48,11 +49,12 @@ fs = FileSystem.get(conf); root = new Path(build/test/dedup2-test- + new Random().nextInt()); // create test indexes -index1 = createIndex(index1, true, 1.0f, 10L); -index2 = createIndex(index2, false, 2.0f, 20L); +index1 = createIndex(index1, true, 1.0f, 10L, false); +index2 = createIndex(index2, false, 2.0f, 20L, true); +index3 = createIndex(index3, true, 1.0f, 10L, true); } - private Path createIndex(String name, boolean hashDup, float inc, long time) throws Exception { + private Path createIndex(String name, boolean hashDup, float inc, long time, boolean incFirst) throws Exception { Path idx = new Path(root, name); Path sub = new Path(idx, part-); Directory dir = FSDirectory.getDirectory(sub.toString(), true); @@ -60,18 +62,18 @@ Document doc = makeDoc(name, MD5Hash.digest(1).toString(), http://www.example.com/1;, -1.0f, time); +1.0f + (incFirst ? inc : 0.0f), time); writer.addDocument(doc); if (hashDup) { doc = makeDoc(name, MD5Hash.digest(1).toString(), http://www.example.com/2;, - 1.0f + inc, time + 1); + 1.0f + (!incFirst ? inc : 0.0f), time + 1); } else { doc = makeDoc(name, MD5Hash.digest(2).toString(), http://www.example.com/1;, - 1.0f + inc, time + 1); + 1.0f
svn commit: r493085 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
Author: ab Date: Fri Jan 5 08:58:29 2007 New Revision: 493085 URL: http://svn.apache.org/viewvc?view=revrev=493085 Log: Fix NUTCH-425 and NUTCH-426. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=493085r1=493084r2=493085 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Jan 5 08:58:29 2007 @@ -114,6 +114,9 @@ 36. Fix Injector to preserve already existing CrawlDatum if the seed list being injected also contains such URL. (ab) +37. NUTCH-425, NUTCH-426 - Fix anchors pollution. Continue after +skipping bad URLs. (Michael Stack via ab) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diffrev=493085r1=493084r2=493085 == --- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Fri Jan 5 08:58:29 2007 @@ -20,6 +20,7 @@ import java.io.FileInputStream; import java.io.InputStream; import java.io.InputStreamReader; +import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; @@ -106,7 +107,7 @@ // if (LOG.isInfoEnabled()) { // LOG.info(script: language= + lang + , text: + script.toString()); // } - Outlink[] links = getJSLinks(script.toString(), base, base); + Outlink[] links = getJSLinks(script.toString(), , base); if (links != null links.length 0) outlinks.addAll(Arrays.asList(links)); // no other children of interest here, go one level up. return; @@ -123,11 +124,11 @@ Node anode = attrs.item(i); Outlink[] links = null; if (anode.getNodeName().startsWith(on)) { -links = getJSLinks(anode.getNodeValue(), base, base); +links = getJSLinks(anode.getNodeValue(), , base); } else if (anode.getNodeName().equalsIgnoreCase(href)) { String val = anode.getNodeValue(); if (val != null val.toLowerCase().indexOf(javascript:) != -1) { - links = getJSLinks(val, base, base); + links = getJSLinks(val, , base); } } if (links != null links.length 0) outlinks.addAll(Arrays.asList(links)); @@ -146,7 +147,7 @@ return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT, Content not JavaScript: ' + type + ').getEmptyParse(getConf()); String script = new String(c.getContent()); -Outlink[] outlinks = getJSLinks(script, c.getUrl(), c.getUrl()); +Outlink[] outlinks = getJSLinks(script, , c.getUrl()); if (outlinks == null) outlinks = new Outlink[0]; // Title? use the first line of the script... String title; @@ -212,7 +213,19 @@ } if (url.startsWith(www.)) { url = http://; + url; -} else url = new URL(baseURL, url).toString(); +} else { + // See if candidate URL is parseable. If not, pass and move on to + // the next match. + try { +url = new URL(baseURL, url).toString(); + } catch (MalformedURLException ex) { +if (LOG.isTraceEnabled()) { + LOG.trace( - failed URL parse ' + url + ' and baseURL ' + + baseURL + ', ex); +} +continue; + } +} url = url.replaceAll(amp;, ); if (LOG.isTraceEnabled()) { LOG.trace( - outlink from JS: ' + url + '); @@ -249,7 +262,7 @@ while ((line = br.readLine()) != null) sb.append(line + \n); JSParseFilter parseFilter = new JSParseFilter(); parseFilter.setConf(NutchConfiguration.create()); -Outlink[] links = parseFilter.getJSLinks(sb.toString(), args[1], args[1]); +Outlink[] links = parseFilter.getJSLinks(sb.toString(), , args[1]); System.out.println(Outlinks extracted: + links.length); for (int i = 0; i links.length; i++) System.out.println( - + links[i]);
svn commit: r487143 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
Author: ab Date: Thu Dec 14 00:53:08 2006 New Revision: 487143 URL: http://svn.apache.org/viewvc?view=revrev=487143 Log: Check if paths exist before deleting them. Reported by Renaud Richardet. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?view=diffrev=487143r1=487142r2=487143 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Dec 14 00:53:08 2006 @@ -315,11 +315,13 @@ FileSystem fs = new JobClient(job).getFs(); Path old = new Path(linkDb, old); Path current = new Path(linkDb, CURRENT_NAME); -fs.delete(old); -fs.rename(current, old); +if (fs.exists(current)) { + if (fs.exists(old)) fs.delete(old); + fs.rename(current, old); +} fs.mkdirs(linkDb); fs.rename(newLinkDb, current); -fs.delete(old); +if (fs.exists(old)) fs.delete(old); } public static void main(String[] args) throws Exception {
svn commit: r487145 - in /lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl: CrawlDb.java LinkDb.java
Author: ab Date: Thu Dec 14 01:06:56 2006 New Revision: 487145 URL: http://svn.apache.org/viewvc?view=revrev=487145 Log: Check if paths exist before deleting them. Reported by Renaud Richardet. Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/LinkDb.java Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDb.java?view=diffrev=487145r1=487144r2=487145 == --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDb.java Thu Dec 14 01:06:56 2006 @@ -98,11 +98,13 @@ FileSystem fs = new JobClient(job).getFs(); Path old = new Path(crawlDb, old); Path current = new Path(crawlDb, CrawlDatum.DB_DIR_NAME); -fs.delete(old); -fs.rename(current, old); +if (fs.exists(current)) { + if (fs.exists(old)) fs.delete(old); + fs.rename(current, old); +} fs.mkdirs(crawlDb); fs.rename(newCrawlDb, current); -fs.delete(old); +if (fs.exists(old)) fs.delete(old); } public static void main(String[] args) throws Exception { Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/LinkDb.java?view=diffrev=487145r1=487144r2=487145 == --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/LinkDb.java Thu Dec 14 01:06:56 2006 @@ -279,11 +279,13 @@ FileSystem fs = new JobClient(job).getFs(); Path old = new Path(linkDb, old); Path current = new Path(linkDb, CURRENT_NAME); -fs.delete(old); -fs.rename(current, old); +if (fs.exists(current)) { + if (fs.exists(old)) fs.delete(old); + fs.rename(current, old); +} fs.mkdirs(linkDb); fs.rename(newLinkDb, current); -fs.delete(old); +if (fs.exists(old)) fs.delete(old); } public static void main(String[] args) throws Exception {
svn commit: r485587 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
Author: ab Date: Mon Dec 11 02:04:59 2006 New Revision: 485587 URL: http://svn.apache.org/viewvc?view=revrev=485587 Log: Remove misplaced cast, which sometimes lead to an overflow. Close readers when done - when using local FS this would prevent us from deleting temporary dirs. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?view=diffrev=485587r1=485586r2=485587 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Mon Dec 11 02:04:59 2006 @@ -245,8 +245,6 @@ job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME)); job.setInputFormat(SequenceFileInputFormat.class); -job.setInputKeyClass(Text.class); -job.setInputValueClass(CrawlDatum.class); job.setMapperClass(CrawlDbStatMapper.class); job.setCombinerClass(CrawlDbStatCombiner.class); @@ -286,6 +284,7 @@ val.set(val.get() + value.get()); } } + reader.close(); } if (LOG.isInfoEnabled()) { @@ -302,7 +301,7 @@ } else if (k.equals(scx)) { LOG.info(max score:\t + (float) (val.get() / 1000.0f)); } else if (k.equals(sct)) { - LOG.info(avg score:\t + (float) ((float) (val.get() / (float)totalCnt.get()) / 1000.0f)); + LOG.info(avg score:\t + (float) ((float) (val.get() / totalCnt.get()) / 1000.0f)); } else if (k.startsWith(status)) { int code = Integer.parseInt(k.substring(k.indexOf(' ') + 1)); LOG.info(k + ( + CrawlDatum.statNames[code] + ):\t + val);
svn commit: r483420 - in /lucene/nutch/trunk: lib/hadoop-0.7.1.jar lib/hadoop-0.9.1.jar src/java/org/apache/nutch/crawl/CrawlDb.java src/java/org/apache/nutch/parse/ParseOutputFormat.java src/test/org
Author: ab Date: Thu Dec 7 03:21:08 2006 New Revision: 483420 URL: http://svn.apache.org/viewvc?view=revrev=483420 Log: Upgrade to Hadoop 0.9.1 . Added: lucene/nutch/trunk/lib/hadoop-0.9.1.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.7.1.jar Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java Added: lucene/nutch/trunk/lib/hadoop-0.9.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.9.1.jar?view=autorev=483420 == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.9.1.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?view=diffrev=483420r1=483419r2=483420 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Thu Dec 7 03:21:08 2006 @@ -115,11 +115,13 @@ FileSystem fs = new JobClient(job).getFs(); Path old = new Path(crawlDb, old); Path current = new Path(crawlDb, CrawlDatum.DB_DIR_NAME); -fs.delete(old); -fs.rename(current, old); +if (fs.exists(current)) { + if (fs.exists(old)) fs.delete(old); + fs.rename(current, old); +} fs.mkdirs(crawlDb); fs.rename(newCrawlDb, current); -fs.delete(old); +if (fs.exists(old)) fs.delete(old); } public static void main(String[] args) throws Exception { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?view=diffrev=483420r1=483419r2=483420 == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Thu Dec 7 03:21:08 2006 @@ -22,6 +22,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.*; +import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.fetcher.Fetcher; import org.apache.hadoop.fs.*; @@ -68,13 +69,13 @@ new Path(new Path(job.getOutputPath(), CrawlDatum.PARSE_DIR_NAME), name); final MapFile.Writer textOut = - new MapFile.Writer(fs, text.toString(), Text.class, ParseText.class); + new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class, CompressionType.RECORD); final MapFile.Writer dataOut = - new MapFile.Writer(fs, data.toString(), Text.class,ParseData.class,true); + new MapFile.Writer(job, fs, data.toString(), Text.class,ParseData.class); final SequenceFile.Writer crawlOut = - new SequenceFile.Writer(fs, crawl, Text.class, CrawlDatum.class); + SequenceFile.createWriter(fs, job, crawl, Text.class, CrawlDatum.class); return new RecordWriter() { Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java?view=diffrev=483420r1=483419r2=483420 == --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java Thu Dec 7 03:21:08 2006 @@ -106,8 +106,8 @@ FileSystem fs = FileSystem.get(configuration); Path file = new Path(System.getProperty(java.io.tmpdir), mapTestFile); fs.delete(file); -org.apache.hadoop.io.SequenceFile.Writer writer = new SequenceFile.Writer( -fs, file, IntWritable.class, MapWritable.class); +org.apache.hadoop.io.SequenceFile.Writer writer = SequenceFile.createWriter( +fs, configuration, file, IntWritable.class, MapWritable.class); // write map System.out.println(start writing map's); long start = System.currentTimeMillis(); @@ -139,8 +139,8 @@ fs.delete(file); // Text -System.out.println(start writing utf8's); -writer = new SequenceFile.Writer(fs, file, IntWritable.class, Text.class); +System.out.println(start writing Text's); +writer = SequenceFile.createWriter(fs, configuration, file, IntWritable.class, Text.class); // write map start = System.currentTimeMillis(); key = new IntWritable(); @@ -153,17 +153,17
svn commit: r482674 - in /lucene/nutch/trunk/src: java/org/apache/nutch/fetcher/ java/org/apache/nutch/protocol/ plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ plugin/protocol-file/src/j
Author: ab Date: Tue Dec 5 06:34:13 2006 New Revision: 482674 URL: http://svn.apache.org/viewvc?view=revrev=482674 Log: Refactor robots.txt checking so that it's protocol independent. Make blocking and robots checking optional inside lib-http. This is needed for alternative Fetcher implementations, which may handle these aspects outside the protocol plugins. Added: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/EmptyRobotRules.java (with props) lucene/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRules.java (with props) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diffrev=482674r1=482673r2=482674 == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Dec 5 06:34:13 2006 @@ -434,8 +434,6 @@ job.setInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); job.setInputFormat(InputFormat.class); -job.setInputKeyClass(Text.class); -job.setInputValueClass(CrawlDatum.class); job.setMapRunnerClass(Fetcher.class); Added: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/EmptyRobotRules.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/EmptyRobotRules.java?view=autorev=482674 == --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/EmptyRobotRules.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/EmptyRobotRules.java Tue Dec 5 06:34:13 2006 @@ -0,0 +1,26 @@ +/* + * Created on Aug 4, 2006 + * Author: Andrzej Bialecki lt;[EMAIL PROTECTED]gt; + * + */ +package org.apache.nutch.protocol; + +import java.net.URL; + +public class EmptyRobotRules implements RobotRules { + + public static final RobotRules RULES = new EmptyRobotRules(); + + public long getCrawlDelay() { +return -1; + } + + public long getExpireTime() { +return -1; + } + + public boolean isAllowed(URL url) { +return true; + } + +} Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/EmptyRobotRules.java -- svn:eol-style = native Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?view=diffrev=482674r1=482673r2=482674 == --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Tue Dec 5 06:34:13 2006 @@ -30,8 +30,34 @@ public interface Protocol extends Pluggable, Configurable { /** The name of the extension point. */ public final static String X_POINT_ID = Protocol.class.getName(); + + /** + * Property name. If in the current configuration this property is set to + * true, protocol implementations should handle politeness limits + * internally. If this is set to false, it is assumed that these limits are + * enforced elsewhere, and protocol implementations should not enforce them + * internally. + */ + public final static String CHECK_BLOCKING = protocol.plugin.check.blocking; + + /** + * Property name. If in the current configuration this property is set to + * true, protocol implementations should handle robot exclusion rules + * internally. If this is set to false, it is assumed that these limits are + * enforced elsewhere, and protocol implementations should not enforce them + * internally. + */ + public final static String CHECK_ROBOTS = protocol.plugin.check.robots; /** Returns the [EMAIL PROTECTED] Content} for a fetchlist entry. */ ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum); + + /** + * Retrieve robot rules applicable for this url. + * @param url url to check + * @param datum page datum + * @return robot rules (specific for this url or default), never null + */ + RobotRules getRobotRules(Text url, CrawlDatum datum); } Added: lucene/nutch/trunk/src/java/org/apache/nutch/protocol
svn commit: r480188 - in /lucene/nutch/trunk/src: java/org/apache/nutch/fetcher/ java/org/apache/nutch/indexer/ java/org/apache/nutch/metadata/ java/org/apache/nutch/parse/ java/org/apache/nutch/segme
Author: ab Date: Tue Nov 28 12:14:58 2006 New Revision: 480188 URL: http://svn.apache.org/viewvc?view=revrev=480188 Log: Move some constants to Nutch.java, so that Metadata could use them properly. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diffrev=480188r1=480187r2=480188 == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Nov 28 12:14:58 2006 @@ -33,6 +33,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.net.*; import org.apache.nutch.protocol.*; import org.apache.nutch.parse.*; @@ -45,10 +46,6 @@ public static final Log LOG = LogFactory.getLog(Fetcher.class); - public static final String SIGNATURE_KEY = nutch.content.digest; - public static final String SEGMENT_NAME_KEY = nutch.segment.name; - public static final String SCORE_KEY = nutch.crawl.score; - public static class InputFormat extends SequenceFileInputFormat { /** Don't split inputs, to keep things polite. */ public FileSplit[] getSplits(FileSystem fs, JobConf job, int nSplits) @@ -268,7 +265,7 @@ } Metadata metadata = content.getMetadata(); // add segment to metadata - metadata.set(SEGMENT_NAME_KEY, segmentName); + metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); // add score to content metadata so that ParseSegment can pick it up. try { scfilters.passScoreBeforeParsing(key, datum, content); @@ -297,11 +294,11 @@ // Calculate page signature. For non-parsing fetchers this will // be done in ParseSegment byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse); -metadata.set(SIGNATURE_KEY, StringUtil.toHexString(signature)); +metadata.set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); datum.setSignature(signature); // Ensure segment name and score are in parseData metadata -parse.getData().getContentMeta().set(SEGMENT_NAME_KEY, segmentName); -parse.getData().getContentMeta().set(SIGNATURE_KEY, StringUtil.toHexString(signature)); +parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName); +parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); try { scfilters.passScoreAfterParsing(key, content, parse); } catch (Exception e) { @@ -359,7 +356,7 @@ public void configure(JobConf job) { setConf(job); -this.segmentName = job.get(SEGMENT_NAME_KEY); +this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY); this.storingContent = isStoringContent(job); this.parsing = isParsing(job); @@ -430,7 +427,7 @@ job.setJobName(fetch + segment); job.setInt(fetcher.threads.fetch, threads); -job.set(SEGMENT_NAME_KEY, segment.getName()); +job.set(Nutch.SEGMENT_NAME_KEY, segment.getName()); // for politeness, don't permit parallel execution of a single task job.setSpeculativeExecution(false); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diffrev=480188r1=480187r2=480188 == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Tue Nov 28 12:14:58 2006 @@ -47,6 +47,7 @@ import org.apache.lucene.index.*; import org.apache.lucene.document.*; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; /** Create indexes for segments. */ public class Indexer extends ToolBase implements Reducer { @@ -220,11 +221,11 @@ Metadata metadata = parseData.getContentMeta(); // add segment, used to map from merged index back to segment files -doc.add(new Field(segment, metadata.get(Fetcher.SEGMENT_NAME_KEY), +doc.add(new Field(segment, metadata.get
svn commit: r480207 - in /lucene/nutch/trunk/src: java/org/apache/nutch/metadata/ java/org/apache/nutch/protocol/ plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ plugin/protocol-httpclie
Author: ab Date: Tue Nov 28 13:02:10 2006 New Revision: 480207 URL: http://svn.apache.org/viewvc?view=revrev=480207 Log: Use SpellCheckedMetadata only when necessary, i.e. only when collecting metadata from unreliable sources such as HTTP headers. * Metadata: fix a bug where SpellCheckedMetadata would try to normalize metadata names during (de)serialization. * Content: should use regular Metadata by default, and when de-serializing. * fix HTTP protocol plugins to use SpellCheckedMetadata, where it's really necessary. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diffrev=480207r1=480206r2=480207 == --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Tue Nov 28 13:02:10 2006 @@ -92,6 +92,10 @@ * @return the values associated to a metadata name. */ public String[] getValues(final String name) { +return _getValues(name); + } + + private String[] _getValues(final String name) { String[] values = metadata.get(name); if (values == null) { values = new String[0]; @@ -174,8 +178,8 @@ String[] names = names(); for (int i = 0; i names.length; i++) { - String[] otherValues = other.getValues(names[i]); - String[] thisValues = getValues(names[i]); + String[] otherValues = other._getValues(names[i]); + String[] thisValues = _getValues(names[i]); if (otherValues.length != thisValues.length) { return false; } @@ -192,7 +196,7 @@ StringBuffer buf = new StringBuffer(); String[] names = names(); for (int i = 0; i names.length; i++) { - String[] values = getValues(names[i]); + String[] values = _getValues(names[i]); for (int j = 0; j values.length; j++) { buf.append(names[i]) .append(=) @@ -209,7 +213,7 @@ String[] names = names(); for (int i = 0; i names.length; i++) { Text.writeString(out, names[i]); - values = getValues(names[i]); + values = _getValues(names[i]); int cnt = 0; for (int j = 0; j values.length; j++) { if (values[j] != null) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?view=diffrev=480207r1=480206r2=480207 == --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Tue Nov 28 13:02:10 2006 @@ -31,7 +31,6 @@ import org.apache.hadoop.io.UTF8; import org.apache.hadoop.io.VersionMismatchException; import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.metadata.SpellCheckedMetadata; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.mime.MimeType; import org.apache.nutch.util.mime.MimeTypeException; @@ -97,7 +96,7 @@ protected final void readFieldsCompressed(DataInput in) throws IOException { version = in.readByte(); -metadata = new SpellCheckedMetadata(); +metadata = new Metadata(); switch (version) { case 0: case 1: Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?view=diffrev=480207r1=480206r2=480207 == --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Tue Nov 28 13:02:10 2006 @@ -31,6 +31,7 @@ // Nutch imports import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.SpellCheckedMetadata; import org.apache.nutch.net.protocols.Response; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.http.api.HttpBase; @@ -47,7 +48,7 @@ private String base; private byte[] content; private int code; - private Metadata headers = new Metadata(); + private Metadata headers = new SpellCheckedMetadata
svn commit: r474756 - /lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
Author: ab Date: Tue Nov 14 04:11:30 2006 New Revision: 474756 URL: http://svn.apache.org/viewvc?view=revrev=474756 Log: NUTCH-401: use hadoop.tmp.dir instead of hardcoded /tmp. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?view=diffrev=474756r1=474755r2=474756 == --- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Tue Nov 14 04:11:30 2006 @@ -202,7 +202,7 @@ job.setMapperClass(InputCompatMapper.class); job.setReducerClass(SegmentReader.class); -Path tempDir = new Path(/tmp/segread- + new java.util.Random().nextInt()); +Path tempDir = new Path(job.get(hadoop.tmp.dir, /tmp) + /segread- + new java.util.Random().nextInt()); fs.delete(tempDir); job.setOutputPath(tempDir);
svn commit: r474763 - /lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/segment/SegmentReader.java
Author: ab Date: Tue Nov 14 04:24:48 2006 New Revision: 474763 URL: http://svn.apache.org/viewvc?view=revrev=474763 Log: NUTCH-401: use mapred.temp.dir instead of hardcoded /tmp. Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/segment/SegmentReader.java Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/segment/SegmentReader.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/segment/SegmentReader.java?view=diffrev=474763r1=474762r2=474763 == --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/segment/SegmentReader.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/segment/SegmentReader.java Tue Nov 14 04:24:48 2006 @@ -185,7 +185,7 @@ job.setReducerClass(SegmentReader.class); -Path tempDir = new Path(/tmp/segread- + new java.util.Random().nextInt()); +Path tempDir = new Path(job.get(mapred.temp.dir, /tmp) + /segread- + new java.util.Random().nextInt()); fs.delete(tempDir); job.setOutputPath(tempDir);
svn commit: r474934 - /lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
Author: ab Date: Tue Nov 14 11:38:06 2006 New Revision: 474934 URL: http://svn.apache.org/viewvc?view=revrev=474934 Log: Add an ObjectWritable decorator. Added: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java (with props) Added: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java?view=autorev=474934 == --- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java Tue Nov 14 11:38:06 2006 @@ -0,0 +1,109 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.metadata; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.ObjectWritable; + +/** + * This is a simple decorator that adds metadata to any Object-s that can be + * serialized by ttObjectWritable/tt. This is useful when data needs to be + * temporarily enriched during processing, but this + * temporary metadata doesn't need to be permanently stored after the job is done. + * + * @author Andrzej Bialecki + */ +public class MetaWrapper extends ObjectWritable { + private Metadata metadata; + + public MetaWrapper() { +super(); +metadata = new Metadata(); + } + + public MetaWrapper(Object object, Configuration conf) { +super(object); +metadata = new Metadata(); +setConf(conf); + } + + public MetaWrapper(Metadata metadata, Object object, Configuration conf) { +super(object); +if (metadata == null) metadata = new Metadata(); +this.metadata = metadata; +setConf(conf); + } + + /** + * Get all metadata. + */ + public Metadata getMetadata() { +return metadata; + } + + /** + * Add metadata. See [EMAIL PROTECTED] Metadata#add(String, String)} for more information. + * @param name metadata name + * @param value metadata value + */ + public void addMeta(String name, String value) { +metadata.add(name, value); + } + + /** + * Set metadata. See [EMAIL PROTECTED] Metadata#set(String, String)} for more information. + * @param name + * @param value + */ + public void setMeta(String name, String value) { +metadata.set(name, value); + } + + /** + * Get metadata. See [EMAIL PROTECTED] Metadata#get(String)} for more information. + * @param name + * @return metadata value + */ + public String getMeta(String name) { +return metadata.get(name); + } + + /** + * Get multiple metadata. See [EMAIL PROTECTED] Metadata#getValues(String)} for more information. + * @param name + * @return multiple values + */ + public String[] getMetaValues(String name) { +return metadata.getValues(name); + } + + public void readFields(DataInput in) throws IOException { +super.readFields(in); +metadata = new Metadata(); +metadata.readFields(in); + } + + public void write(DataOutput out) throws IOException { +super.write(out); +metadata.write(out); + } +} \ No newline at end of file Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java -- svn:eol-style = native
svn commit: r469662 - /lucene/nutch/trunk/CHANGES.txt
Author: ab Date: Tue Oct 31 13:36:01 2006 New Revision: 469662 URL: http://svn.apache.org/viewvc?view=revrev=469662 Log: Update. Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diffrev=469662r1=469661r2=469662 == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Oct 31 13:36:01 2006 @@ -66,6 +66,9 @@ 20. NUTCH-379 - ParseUtil does not pass through the content's URL to the ParserFactory (Chris A. Mattmann via siren) +21. NUTCH-361, NUTCH-136 - When jobtracker is 'local' generate only one +partition. (ab) + Release 0.8 - 2006-07-25
svn commit: r469667 - in /lucene/nutch/branches/branch-0.8: CHANGES.txt src/java/org/apache/nutch/crawl/Generator.java
Author: ab Date: Tue Oct 31 13:46:26 2006 New Revision: 469667 URL: http://svn.apache.org/viewvc?view=revrev=469667 Log: NUTCH-361, NUTCH-136 - When jobtracker is 'local' generate only one partition. Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?view=diffrev=469667r1=469666r2=469667 == --- lucene/nutch/branches/branch-0.8/CHANGES.txt (original) +++ lucene/nutch/branches/branch-0.8/CHANGES.txt Tue Oct 31 13:46:26 2006 @@ -8,6 +8,9 @@ 2. NUTCH-379 - ParseUtil does not pass through the content's URL to the ParserFactory (Chris A. Mattmann via siren) + 3. NUTCH-361, NUTCH-136 - When jobtracker is 'local' generate only one +partition. (ab) + Release 0.8.1 - 2006-09-24 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Generator.java?view=diffrev=469667r1=469666r2=469667 == --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Generator.java Tue Oct 31 13:46:26 2006 @@ -299,6 +299,12 @@ numLists = job.getNumMapTasks();// a partition per fetch task } +if (local.equals(job.get(mapred.job.tracker)) numLists != 1) { + // override + LOG.info(Generator: jobtracker is 'local', generating exactly one partition.); + numLists = 1; +} + job.setLong(crawl.gen.curTime, curTime); job.setLong(crawl.topN, topN);
svn commit: r468673 - /lucene/nutch/branches/branch-0.8/build.xml
Author: ab Date: Sat Oct 28 03:32:44 2006 New Revision: 468673 URL: http://svn.apache.org/viewvc?view=revrev=468673 Log: Fix NUTCH-394. Modified: lucene/nutch/branches/branch-0.8/build.xml Modified: lucene/nutch/branches/branch-0.8/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/build.xml?view=diffrev=468673r1=468672r2=468673 == --- lucene/nutch/branches/branch-0.8/build.xml (original) +++ lucene/nutch/branches/branch-0.8/build.xml Sat Oct 28 03:32:44 2006 @@ -164,6 +164,7 @@ include name=hadoop-*.jar/ include name=dom4j-*.jar/ include name=xerces-*.jar/ +include name=commons-cli-*.jar/ include name=commons-lang-*.jar/ include name=commons-logging-*.jar/ include name=log4j-*.jar/
svn commit: r454297 - /lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
Author: ab Date: Mon Oct 9 00:13:46 2006 New Revision: 454297 URL: http://svn.apache.org/viewvc?view=revrev=454297 Log: Fix NPE when document properties are null. Reported by Trym Asserson. Modified: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Modified: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?view=diffrev=454297r1=454296r2=454297 == --- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (original) +++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Mon Oct 9 00:13:46 2006 @@ -85,15 +85,17 @@ } catch (Exception e) { return new ParseStatus(ParseStatus.FAILED, - Can't be handled as micrsosoft document. + e) + Can't be handled as Microsoft document. + e) .getEmptyParse(this.conf); } // collect meta data Metadata metadata = new Metadata(); -title = properties.getProperty(DublinCore.TITLE); -properties.remove(DublinCore.TITLE); -metadata.setAll(properties); +if (properties != null) { + title = properties.getProperty(DublinCore.TITLE); + properties.remove(DublinCore.TITLE); + metadata.setAll(properties); +} if (text == null) { text = ; } if (title == null) { title = ; }
svn commit: r454298 - /lucene/nutch/branches/branch-0.8/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
Author: ab Date: Mon Oct 9 00:22:00 2006 New Revision: 454298 URL: http://svn.apache.org/viewvc?view=revrev=454298 Log: Fix NPE when document properties are null. Reported by Trym Asserson. Modified: lucene/nutch/branches/branch-0.8/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Modified: lucene/nutch/branches/branch-0.8/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?view=diffrev=454298r1=454297r2=454298 == --- lucene/nutch/branches/branch-0.8/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (original) +++ lucene/nutch/branches/branch-0.8/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Mon Oct 9 00:22:00 2006 @@ -85,15 +85,17 @@ } catch (Exception e) { return new ParseStatus(ParseStatus.FAILED, - Can't be handled as micrsosoft document. + e) + Can't be handled as Microsoft document. + e) .getEmptyParse(this.conf); } // collect meta data Metadata metadata = new Metadata(); -title = properties.getProperty(DublinCore.TITLE); -properties.remove(DublinCore.TITLE); -metadata.setAll(properties); +if (properties != null) { + title = properties.getProperty(DublinCore.TITLE); + properties.remove(DublinCore.TITLE); + metadata.setAll(properties); +} if (text == null) { text = ; } if (title == null) { title = ; }
svn commit: r450799 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/crawl/CrawlDb.java src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Author: ab Date: Thu Sep 28 03:48:25 2006 New Revision: 450799 URL: http://svn.apache.org/viewvc?view=revrev=450799 Log: Bring back the '-noAdditions' option. This is useful for running constrained crawls, where the complete list of URLs is known in advance. Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diffrev=450799r1=450798r2=450799 == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Sep 28 03:48:25 2006 @@ -237,6 +237,15 @@ /property property + namedb.update.additions.allowed/name + valuetrue/value + descriptionIf true, updatedb will add newly discovered URLs, if false + only already existing URLs in the CrawlDb will be updated and no new + URLs will be added. + /description +/property + +property namedb.ignore.internal.links/name valuetrue/value descriptionIf true, when adding new links to a page, links from Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?view=diffrev=450799r1=450798r2=450799 == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Thu Sep 28 03:48:25 2006 @@ -38,6 +38,7 @@ * crawldb accordingly. */ public class CrawlDb extends ToolBase { + public static final String CRAWLDB_ADDITIONS_ALLOWED = db.update.additions.allowed; public static final Log LOG = LogFactory.getLog(CrawlDb.class); @@ -50,16 +51,23 @@ } public void update(Path crawlDb, Path segment, boolean normalize, boolean filter) throws IOException { +boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true); +update(crawlDb, segment, normalize, filter, additionsAllowed); + } + + public void update(Path crawlDb, Path segment, boolean normalize, boolean filter, boolean additionsAllowed) throws IOException { if (LOG.isInfoEnabled()) { LOG.info(CrawlDb update: starting); LOG.info(CrawlDb update: db: + crawlDb); LOG.info(CrawlDb update: segment: + segment); + LOG.info(CrawlDb update: additions allowed: + additionsAllowed); LOG.info(CrawlDb update: URL normalizing: + normalize); LOG.info(CrawlDb update: URL filtering: + filter); } JobConf job = CrawlDb.createJob(getConf(), crawlDb); +job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed); job.setBoolean(CrawlDbFilter.URL_FILTERING, filter); job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize); job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME)); @@ -122,26 +130,30 @@ public int run(String[] args) throws Exception { if (args.length 2) { - System.err.println(Usage: CrawlDb crawldb segment [-normalize] [-filter]); + System.err.println(Usage: CrawlDb crawldb segment [-normalize] [-filter] [-noAdditions]); System.err.println(\tcrawldb\tCrawlDb to update); System.err.println(\tsegment\tsegment name to update from); System.err.println(\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)); System.err.println(\t-filter\tuse URLFilters on urls in CrawlDb and segment); + System.err.println(\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs); return -1; } boolean normalize = false; boolean filter = false; +boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true); if (args.length 2) { for (int i = 2; i args.length; i++) { if (args[i].equals(-normalize)) { normalize = true; } else if (args[i].equals(-filter)) { filter = true; +} else if (args[i].equals(-noAdditions)) { + additionsAllowed = false; } } } try { - update(new Path(args[0]), new Path(args[1]), normalize, filter); + update(new Path(args[0]), new Path(args[1]), normalize, filter, additionsAllowed); return 0; } catch (Exception e) { LOG.fatal(CrawlDb update: + StringUtils.stringifyException(e)); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?view=diffrev=450799r1=450798r2=450799 == --- lucene/nutch/trunk/src/java/org/apache/nutch
svn commit: r449738 - /lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Author: ab Date: Mon Sep 25 09:58:49 2006 New Revision: 449738 URL: http://svn.apache.org/viewvc?view=revrev=449738 Log: Don't create dummy Content (throws NPE), just pass null. Reported by Richard Braman. Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?view=diffrev=449738r1=449737r2=449738 == --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Sep 25 09:58:49 2006 @@ -193,9 +193,7 @@ // skip this page, otherwise the thread would block for too long. LOGGER.info(Skipping: + u + exceeds fetcher.max.crawl.delay, max= + (maxCrawlDelay / 1000) + , Crawl-Delay= + (delay / 1000)); -Content c = new Content(u.toString(), u.toString(), EMPTY_CONTENT, -null, null, this.conf); -return new ProtocolOutput(c, ProtocolStatus.STATUS_WOULDBLOCK); +return new ProtocolOutput(null, ProtocolStatus.STATUS_WOULDBLOCK); } String host; try {
svn commit: r449742 - /lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Author: ab Date: Mon Sep 25 10:05:22 2006 New Revision: 449742 URL: http://svn.apache.org/viewvc?view=revrev=449742 Log: Don't create dummy Content (throws NPE), just pass null. Reported by Richard Braman. Modified: lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Modified: lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?view=diffrev=449742r1=449741r2=449742 == --- lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Sep 25 10:05:22 2006 @@ -193,9 +193,7 @@ // skip this page, otherwise the thread would block for too long. LOGGER.info(Skipping: + u + exceeds fetcher.max.crawl.delay, max= + (maxCrawlDelay / 1000) + , Crawl-Delay= + (delay / 1000)); -Content c = new Content(u.toString(), u.toString(), EMPTY_CONTENT, -null, null, this.conf); -return new ProtocolOutput(c, ProtocolStatus.STATUS_WOULDBLOCK); +return new ProtocolOutput(null, ProtocolStatus.STATUS_WOULDBLOCK); } String host; try {
svn commit: r449765 - /lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java
Author: ab Date: Mon Sep 25 11:14:31 2006 New Revision: 449765 URL: http://svn.apache.org/viewvc?view=revrev=449765 Log: Catch exception on invalid urls, and continue collecting valid ones. Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diffrev=449765r1=449764r2=449765 == --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java Mon Sep 25 11:14:31 2006 @@ -16,6 +16,7 @@ package org.apache.nutch.parse; +import java.net.MalformedURLException; import java.util.ArrayList; import java.util.List; @@ -108,7 +109,13 @@ } result = matcher.getMatch(); url = result.group(0); -outlinks.add(new Outlink(url, anchor, conf)); +url = result.group(0); +try { + Outlink outlink = new Outlink(url, anchor, conf); + outlinks.add(new Outlink(url, anchor, conf)); +} catch (MalformedURLException mue) { + LOG.warn(Invalid url: ' + url + ', skipping.); +} } } catch (Exception ex) { // if the matcher fails (perhaps a malformed URL) we just log it and move on
svn commit: r449294 - in /lucene/nutch/branches/branch-0.8: ./ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/protocol/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
Author: ab Date: Sat Sep 23 12:45:48 2006 New Revision: 449294 URL: http://svn.apache.org/viewvc?view=revrev=449294 Log: NUTCH-350: Urls blocked by http.max.delays incorrectly marked as GONE. Added: lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java (with props) Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?view=diffrev=449294r1=449293r2=449294 == --- lucene/nutch/branches/branch-0.8/CHANGES.txt (original) +++ lucene/nutch/branches/branch-0.8/CHANGES.txt Sat Sep 23 12:45:48 2006 @@ -38,6 +38,9 @@ 12. NUTCH-337 - Fetcher ignores the fetcher.parse value (Stefan Groschupf via ab) + +13. NUTCH-350 - Urls blocked by http.max.delays incorrectly marked as GONE +(Stefan Groschupf via ab) Release 0.8 - 2006-07-25 Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diffrev=449294r1=449293r2=449294 == --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Sep 23 12:45:48 2006 @@ -188,18 +188,24 @@ } break; + // failures - increase the retry counter case ProtocolStatus.EXCEPTION: logError(url, status.getMessage()); + /* FALLTHROUGH */ case ProtocolStatus.RETRY: // retry datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1); + /* FALLTHROUGH */ + // intermittent blocking - retry without increasing the counter + case ProtocolStatus.WOULDBLOCK: + case ProtocolStatus.BLOCKED: output(url, datum, null, CrawlDatum.STATUS_FETCH_RETRY); break; + // permanent failures case ProtocolStatus.GONE: // gone case ProtocolStatus.NOTFOUND: case ProtocolStatus.ACCESS_DENIED: case ProtocolStatus.ROBOTS_DENIED: - case ProtocolStatus.WOULDBLOCK: case ProtocolStatus.NOTMODIFIED: output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE); break; Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java?view=diffrev=449294r1=449293r2=449294 == --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java Sat Sep 23 12:45:48 2006 @@ -64,7 +64,9 @@ * The expected number of milliseconds to wait before retry may be provided * in args. */ public static final int WOULDBLOCK = 22; - + /** Thread was blocked http.max.delays times during fetching. */ + public static final int BLOCKED = 23; + // Useful static instances for status codes that don't usually require any // additional arguments. public static final ProtocolStatus STATUS_SUCCESS = new ProtocolStatus(SUCCESS); @@ -77,6 +79,7 @@ public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING); public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(NOTMODIFIED); public static final ProtocolStatus STATUS_WOULDBLOCK = new ProtocolStatus(WOULDBLOCK); + public static final ProtocolStatus STATUS_BLOCKED = new ProtocolStatus(BLOCKED); private int code; private long lastModified; @@ -99,6 +102,7 @@ codeToName.put(new Integer(NOTFETCHING), notfetching); codeToName.put(new Integer(NOTMODIFIED), notmodified); codeToName.put(new Integer(WOULDBLOCK), wouldblock); +codeToName.put(new Integer(BLOCKED), blocked); } public ProtocolStatus() { Added: lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/lib
svn commit: r449088 [2/2] - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/parse/ src/plugin
Added: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java?view=autorev=449088 == --- lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java (added) +++ lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java Fri Sep 22 14:05:33 2006 @@ -0,0 +1,176 @@ +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the License); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.regex; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileFilter; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +/** Unit tests for RegexUrlNormalizer. */ +public class TestRegexURLNormalizer extends TestCase { + private static final Log LOG = LogFactory.getLog(TestRegexURLNormalizer.class); + + private RegexURLNormalizer normalizer; + private Configuration conf; + private HashMap testData = new HashMap(); + + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty(test.data, .); + // Make sure sample files are copied to test.data as specified in + // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation. + + public TestRegexURLNormalizer(String name) throws IOException { +super(name); +normalizer = new RegexURLNormalizer(); +conf = NutchConfiguration.create(); +normalizer.setConf(conf); +File[] configs = new File(sampleDir).listFiles(new FileFilter() { + public boolean accept(File f) { +if (f.getName().endsWith(.xml) f.getName().startsWith(regex-normalize-)) + return true; +return false; + } +}); +for (int i = 0; i configs.length; i++) { + try { +FileInputStream fis = new FileInputStream(configs[i]); +String cname = configs[i].getName(); +cname = cname.substring(16, cname.indexOf(.xml)); +normalizer.setConfiguration(fis, cname); +NormalizedURL[] urls = readTestFile(cname); +testData.put(cname, urls); + } catch (Exception e) { +LOG.warn(Could load config from ' + configs[i] + ': + e.toString()); + } +} + } + + public void testNormalizerDefault() throws Exception { +normalizeTest((NormalizedURL[])testData.get(URLNormalizers.SCOPE_DEFAULT), +URLNormalizers.SCOPE_DEFAULT); + } + + public void testNormalizerScope() throws Exception { +Iterator it = testData.keySet().iterator(); +while (it.hasNext()) { + String scope = (String)it.next(); + normalizeTest((NormalizedURL[])testData.get(scope), scope); +} + } + + private void normalizeTest(NormalizedURL[] urls, String scope) throws Exception { +for (int i = 0; i urls.length; i++) { + assertEquals(urls[i].expectedURL, + normalizer.normalize(urls[i].url, scope)); +} + } + + private void bench(int loops, String scope) { +long start = System.currentTimeMillis(); +try { + NormalizedURL[] expected = (NormalizedURL[])testData.get(scope); + if (expected == null) return; + for (int i = 0; i loops; i++) { +normalizeTest(expected, scope); + } +} catch (Exception e) { + fail(e.toString()); +} +LOG.info(bench time ( + loops + ) + + (System.currentTimeMillis() - start) + ms); + } + + private static class NormalizedURL { +String url; +String expectedURL; + +public NormalizedURL(String line) { + String[] fields = line.split(\\s+); + url = fields[0]; + expectedURL = fields[1]; +} + } + + private NormalizedURL[] readTestFile(String scope)
svn commit: r447359 - /lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
Author: ab Date: Mon Sep 18 03:43:07 2006 New Revision: 447359 URL: http://svn.apache.org/viewvc?view=revrev=447359 Log: Fix an NPE when using searcher.max.hits, but NOT using time limit. Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?view=diffrev=447359r1=447358r2=447359 == --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Mon Sep 18 03:43:07 2006 @@ -104,8 +104,10 @@ super(numHits); this.maxHits = maxHits; this.maxTicks = maxTicks; - this.timer = timer; - this.startTicks = timer.timeCounter; + if (timer != null) { +this.timer = timer; +this.startTicks = timer.timeCounter; + } } public void collect(int doc, float score) {
svn commit: r432674 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
Author: ab Date: Fri Aug 18 11:48:29 2006 New Revision: 432674 URL: http://svn.apache.org/viewvc?rev=432674view=rev Log: NUTCH-341 - if -workingdir is specified, always create a unique subdir. Also, use unique directory names to allow multiple IndexMergers to run simultaneously. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=432674r1=432673r2=432674view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java Fri Aug 18 11:48:29 2006 @@ -118,13 +118,13 @@ // Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); -Path workDir = new Path(indexmerger); +Path workDir = new Path(indexmerger- + System.currentTimeMillis()); List indexDirs = new ArrayList(); int i = 0; if (-workingdir.equals(args[i])) { i++; - workDir = new Path(args[i++]); + workDir = new Path(args[i++], indexmerger- + System.currentTimeMillis()); } Path outputIndex = new Path(args[i++]);
svn commit: r432675 - /lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/indexer/IndexMerger.java
Author: ab Date: Fri Aug 18 11:50:00 2006 New Revision: 432675 URL: http://svn.apache.org/viewvc?rev=432675view=rev Log: NUTCH-341 - if -workingdir is specified, always create a unique subdir. Also, use unique directory names to allow multiple IndexMergers to run simultaneously. Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/indexer/IndexMerger.java Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/indexer/IndexMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=432675r1=432674r2=432675view=diff == --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/indexer/IndexMerger.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/indexer/IndexMerger.java Fri Aug 18 11:50:00 2006 @@ -118,13 +118,13 @@ // Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); -Path workDir = new Path(indexmerger); +Path workDir = new Path(indexmerger- + System.currentTimeMillis()); List indexDirs = new ArrayList(); int i = 0; if (-workingdir.equals(args[i])) { i++; - workDir = new Path(args[i++]); + workDir = new Path(args[i++], indexmerger- + System.currentTimeMillis()); } Path outputIndex = new Path(args[i++]);
svn commit: r432254 - /lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Author: ab Date: Thu Aug 17 07:53:54 2006 New Revision: 432254 URL: http://svn.apache.org/viewvc?rev=432254view=rev Log: Move toLowerCase where it actually matters. Fix some whitespace. Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=432254r1=432253r2=432254view=diff == --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Thu Aug 17 07:53:54 2006 @@ -327,8 +327,8 @@ host = url.getHost(); if (host == null) throw new HttpException(Unknown host for url: + url); + host = host.toLowerCase(); } -host = host.toLowerCase(); int delays = 0; while (true) { @@ -389,8 +389,8 @@ private static void cleanExpiredServerBlocks() { synchronized (BLOCKED_ADDR_TO_TIME) { - for(int i = BLOCKED_ADDR_QUEUE.size()-1; i = 0; i--){ - String host = (String) BLOCKED_ADDR_QUEUE.get(i); + for (int i = BLOCKED_ADDR_QUEUE.size() - 1; i = 0; i--) { +String host = (String) BLOCKED_ADDR_QUEUE.get(i); long time = ((Long) BLOCKED_ADDR_TO_TIME.get(host)).longValue(); if (time = System.currentTimeMillis()) { BLOCKED_ADDR_TO_TIME.remove(host);
svn commit: r432256 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Author: ab Date: Thu Aug 17 07:56:35 2006 New Revision: 432256 URL: http://svn.apache.org/viewvc?rev=432256view=rev Log: Apply patch in NUTCH-348 - Generator used the lowest score instead of the highest. Contributed by Chris Schneider and Stefan Groschupf. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=432256r1=432255r2=432256view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Aug 17 07:56:35 2006 @@ -59,7 +59,11 @@ public void write(DataOutput out) throws IOException { url.write(out); datum.write(out); -} +} + +public String toString() { + return url= + url.toString() + , datum= + datum.toString(); +} } /** Selects entries due for fetch. */ @@ -118,7 +122,7 @@ LOG.warn(Couldn't filter generatorSortValue for + key + : + sfe); } } - // sort by decreasing score + // sort by decreasing score, using DecreasingFloatComparator sortValue.set(sort); entry.datum = crawlDatum; entry.url = (UTF8)key; @@ -196,6 +200,20 @@ } + public static class DecreasingFloatComparator extends WritableComparator { + +public DecreasingFloatComparator() { + super(FloatWritable.class); +} + +/** Compares two FloatWritables decreasing. */ +public int compare(WritableComparable o1, WritableComparable o2) { + float thisValue = ((FloatWritable) o1).get(); + float thatValue = ((FloatWritable) o2).get(); + return (thisValuethatValue ? 1 : (thisValue == thatValue ? 0 : -1)); +} + } + public static class SelectorInverseMapper extends MapReduceBase implements Mapper { public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { @@ -270,7 +288,7 @@ if (LOG.isInfoEnabled()) { LOG.info(Generator: starting); LOG.info(Generator: segment: + segment); - LOG.info(Generator: Selecting most-linked urls due for fetch.); + LOG.info(Generator: Selecting best-scoring urls due for fetch.); } // map to inverted subset due for fetch, sort by link count @@ -296,6 +314,7 @@ job.setOutputPath(tempDir); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(FloatWritable.class); +job.setOutputKeyComparatorClass(DecreasingFloatComparator.class); job.setOutputValueClass(SelectorEntry.class); JobClient.runJob(job);
svn commit: r432287 - /lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Generator.java
Author: ab Date: Thu Aug 17 09:35:35 2006 New Revision: 432287 URL: http://svn.apache.org/viewvc?rev=432287view=rev Log: Apply patch in NUTCH-348 - Generator used the lowest score instead of the highest. Contributed by Chris Schneider and Stefan Groschupf. Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Generator.java?rev=432287r1=432286r2=432287view=diff == --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Generator.java Thu Aug 17 09:35:35 2006 @@ -59,7 +59,11 @@ public void write(DataOutput out) throws IOException { url.write(out); datum.write(out); -} +} + +public String toString() { + return url= + url.toString() + , datum= + datum.toString(); +} } /** Selects entries due for fetch. */ @@ -118,7 +122,7 @@ LOG.warn(Couldn't filter generatorSortValue for + key + : + sfe); } } - // sort by decreasing score + // sort by decreasing score, using DecreasingFloatComparator sortValue.set(sort); entry.datum = crawlDatum; entry.url = (UTF8)key; @@ -196,6 +200,20 @@ } + public static class DecreasingFloatComparator extends WritableComparator { + +public DecreasingFloatComparator() { + super(FloatWritable.class); +} + +/** Compares two FloatWritables decreasing. */ +public int compare(WritableComparable o1, WritableComparable o2) { + float thisValue = ((FloatWritable) o1).get(); + float thatValue = ((FloatWritable) o2).get(); + return (thisValuethatValue ? 1 : (thisValue == thatValue ? 0 : -1)); +} + } + public static class SelectorInverseMapper extends MapReduceBase implements Mapper { public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { @@ -270,7 +288,7 @@ if (LOG.isInfoEnabled()) { LOG.info(Generator: starting); LOG.info(Generator: segment: + segment); - LOG.info(Generator: Selecting most-linked urls due for fetch.); + LOG.info(Generator: Selecting best-scoring urls due for fetch.); } // map to inverted subset due for fetch, sort by link count @@ -296,6 +314,7 @@ job.setOutputPath(tempDir); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(FloatWritable.class); +job.setOutputKeyComparatorClass(DecreasingFloatComparator.class); job.setOutputValueClass(SelectorEntry.class); JobClient.runJob(job);