svn commit: r931098 - in /lucene/nutch/trunk: ./ conf/ lib/ src/plugin/ src/plugin/parse-tika/ src/plugin/parse-tika/lib/ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/
Author: jnioche Date: Tue Apr 6 11:38:26 2010 New Revision: 931098 URL: http://svn.apache.org/viewvc?rev=931098view=rev Log: NUTCH-810 Upgraded to Tika 0.7 Added: lucene/nutch/trunk/lib/tika-core-0.7.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar (with props) Removed: lucene/nutch/trunk/lib/tika-core-0.6.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-0.8.0-incubator.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-0.8.0-incubator.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-0.8.0-incubating.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.6.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/tika-mimetypes.xml lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=931098r1=931097r2=931098view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Apr 6 11:38:26 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-810 Upgrade to Tika 0.7 (jnioche) + * NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call scfilters.initialScore on newly created URL (jnioche) * NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche) Modified: 
lucene/nutch/trunk/conf/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/tika-mimetypes.xml?rev=931098r1=931097r2=931098view=diff == --- lucene/nutch/trunk/conf/tika-mimetypes.xml (original) +++ lucene/nutch/trunk/conf/tika-mimetypes.xml Tue Apr 6 11:38:26 2010 @@ -2198,7 +2198,11 @@ mime-type type=application/x-cpio magic priority=50 - match value=070707 type=host16 offset=0/ + match value=070707 type=little16 offset=0/ + match value=070707 type=big16 offset=0/ + match value=070707 type=string offset=0/ + match value=070701 type=string offset=0/ + match value=070702 type=string offset=0/ /magic glob pattern=*.cpio/ /mime-type @@ -3551,7 +3555,13 @@ bad HTML, unfortunately. -- root-XML localName=html/ +root-XML localName=HTML/ root-XML localName=link/ +root-XML localName=LINK/ +root-XML localName=body/ +root-XML localName=BODY/ +root-XML localName=p/ +root-XML localName=P/ magic priority=50 match value=lt;!DOCTYPE HTML type=string offset=0:64/ match value=lt;!doctype html type=string offset=0:64/ Added: lucene/nutch/trunk/lib/tika-core-0.7.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-core-0.7.jar?rev=931098view=auto == Binary file - no diff available. 
Propchange: lucene/nutch/trunk/lib/tika-core-0.7.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=931098r1=931097r2=931098view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Tue Apr 6 11:38:26 2010 @@ -32,8 +32,8 @@ ant dir=index-basic target=deploy/ ant dir=index-anchor target=deploy/ ant dir=index-more target=deploy/ -ant dir=field-basic target=deploy/ -ant dir=field-boost target=deploy/ + ant dir=field-basic target=deploy/ + ant dir=field-boost target=deploy/ ant dir=languageidentifier target=deploy/ ant dir=lib-http target=deploy/ ant dir=lib-jakarta-poi target=deploy/ @@ -65,12 +65,12 @@ ant dir=query-basic target=deploy/ ant dir=query-more target=deploy/ ant dir=query-site target=deploy/ -ant dir=query-custom target=deploy/ + ant dir=query-custom target=deploy/ ant dir=query-url target=deploy/ ant dir=response-json target=deploy/ ant dir
svn commit: r926003 - in /lucene/nutch/trunk: ./ conf/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/pro
Author: jnioche Date: Mon Mar 22 09:00:11 2010 New Revision: 926003 URL: http://svn.apache.org/viewvc?rev=926003view=rev Log: NUTCH-740 Configuration option to override default language for fetched pages Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=926003r1=926002r2=926003view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 22 09:00:11 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-740 Configuration option to override default language for fetched pages (Marcin Okraszewski via jnioche) + * NUTCH-803 Upgrade to Hadoop 0.20.2 (ab) * NUTCH-787 Upgrade Lucene to 3.0.1. (Dawid Weiss via ab) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=926003r1=926002r2=926003view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 22 09:00:11 2010 @@ -228,6 +228,15 @@ /description /property +property + namehttp.accept.language/name + valueen-us,en-gb,en;q=0.7,*;q=0.3/value + descriptionValue of the Accept-Language request header field. + This allows selecting a non-English language as the default one to retrieve. + It is a useful setting for search engines built for a certain national group.
+ /description +/property + !-- FTP properties -- property Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=926003r1=926002r2=926003view=diff == --- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Mar 22 09:00:11 2010 @@ -93,6 +93,8 @@ public abstract class HttpBase implement http://lucene.apache.org/nutch/bot.html;, nutch-ag...@lucene.apache.org); + /** The Accept-Language request header value. */ + protected String acceptLanguage = en-us,en-gb,en;q=0.7,*;q=0.3; /** * Maps from host to a Long naming the time it should be unblocked. @@ -162,6 +164,7 @@ public abstract class HttpBase implement this.maxThreadsPerHost = conf.getInt(fetcher.threads.per.host, 1); this.userAgent = getAgentString(conf.get(http.agent.name), conf.get(http.agent.version), conf .get(http.agent.description), conf.get(http.agent.url), conf.get(http.agent.email)); +this.acceptLanguage = conf.get(http.accept.language, acceptLanguage); this.serverDelay = (long) (conf.getFloat(fetcher.server.delay, 1.0f) * 1000); this.maxCrawlDelay = (long)(conf.getInt(fetcher.max.crawl.delay, -1) * 1000); // backward-compatible default setting @@ -326,6 +329,13 @@ public abstract class HttpBase implement return userAgent; } + /** Value of Accept-Language request header sent by Nutch. + * @return The value of the header Accept-Language header. 
+ */ + public String getAcceptLanguage() { + return acceptLanguage; + } + public boolean getUseHttp11() { return useHttp11; } @@ -470,6 +480,7 @@ public abstract class HttpBase implement logger.info(http.timeout = + timeout); logger.info(http.content.limit = + maxContent); logger.info(http.agent = + userAgent); + logger.info(http.accept.language = + acceptLanguage); logger.info(Protocol.CHECK_BLOCKING + = + checkBlocking); logger.info(Protocol.CHECK_ROBOTS + = + checkRobots); if (checkBlocking) { Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=926003r1=926002r2=926003view=diff == --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java
svn commit: r926155 - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/tools/ src/test/org/apache/nutch/crawl/ src/test/org/ap
Author: jnioche Date: Mon Mar 22 16:19:12 2010 New Revision: 926155 URL: http://svn.apache.org/viewvc?rev=926155view=rev Log: NUTCH-762 : Generator can generate several segments in one parse of the crawlDB Added: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java Removed: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=926155r1=926154r2=926155view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 22 16:19:12 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-762 Generator can generate several segments in one parse of the crawlDB (jnioche) + * NUTCH-740 Configuration option to override default language for fetched pages (Marcin Okraszewski via jnioche) * NUTCH-803 Upgrade to Hadoop 0.20.2 (ab) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=926155r1=926154r2=926155view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 22 16:19:12 2010 @@ -514,24 +514,21 @@ !-- generate properties -- property - namegenerate.max.per.host/name + namegenerate.max.count/name value-1/value - descriptionThe maximum number of urls per host in a single - fetchlist. 
-1 if unlimited./description + descriptionThe maximum number of urls in a single + fetchlist. -1 if unlimited. The urls are counted according + to the value of the parameter generate.count.mode. + /description /property property - namegenerate.max.per.host.by.ip/name - valuefalse/value - descriptionIf false, same host names are counted. If true, - hosts' IP addresses are resolved and the same IP-s are counted. - - -+-+-+- WARNING !!! -+-+-+- - When set to true, Generator will create a lot of DNS lookup - requests, rapidly. This may cause a DOS attack on - remote DNS servers, not to mention increased external traffic - and latency. For these reasons when using this option it is - required that a local caching DNS be used./description + namegenerate.count.mode/name + valuehost/value + descriptionDetermines how the URLs are counted for generate.max.count. + Default value is 'host' but can be 'domain'. Note that we do not count + per IP in the new version of the Generator. + /description /property property @@ -545,6 +542,34 @@ updatedb will generate identical fetchlists./description /property +property + namegenerate.max.per.host/name + value-1/value + description(Deprecated). Use generate.max.count and generate.count.mode instead. + The maximum number of urls per host in a single + fetchlist. -1 if unlimited./description +/property + +!-- urlpartitioner properties -- +property + namepartition.url.mode/name + valuebyHost/value + descriptionDetermines how to partition URLs. Default value is 'byHost', + also takes 'byDomain' or 'byIP'. + /description +/property + +property + namecrawl.gen.delay/name + value60480/value + description + This value, expressed in days, defines how long we should keep the lock on records + in CrawlDb that were just selected for fetching. If these records are not updated + in the meantime, the lock is canceled, i.e. they become eligible for selecting. + Default value of this is 7 days.
+ /description +/property + !-- fetcher properties -- property Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=926155r1=926154r2=926155view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Mon Mar 22 16:19:12 2010 @@ -124,17 +124,17 @@ public class Crawl { injector.inject(crawlDb, rootUrlDir); int i; for (i = 0; i depth; i++) { // generate new segment - Path segment = generator.generate(crawlDb, segments, -1, topN, System + Path[] segs = generator.generate(crawlDb, segments, -1, topN, System
svn commit: r926163 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Author: jnioche Date: Mon Mar 22 16:29:30 2010 New Revision: 926163 URL: http://svn.apache.org/viewvc?rev=926163&view=rev Log: fixed NPE introduced in NUTCH-762 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=926163&r1=926162&r2=926163&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon Mar 22 16:29:30 2010 @@ -480,7 +480,7 @@ public class Generator extends Configure LOG.info("Generator: topN: " + topN); } -if (getConf().get(GENERATE_MAX_PER_HOST_BY_IP).equals("true")){ +if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))){ LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead"); }
svn commit: r921831 - in /lucene/nutch/trunk: ./ lib/
Author: jnioche Date: Thu Mar 11 13:06:12 2010 New Revision: 921831 URL: http://svn.apache.org/viewvc?rev=921831view=rev Log: NUTCH-798 : Upgrade to SOLR1.4 and its dependencies Added: lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar (with props) lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar (with props) lucene/nutch/trunk/lib/commons-httpclient-3.1.jar (with props) lucene/nutch/trunk/lib/commons-io-1.4.jar (with props) lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar (with props) lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar (with props) lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar (with props) lucene/nutch/trunk/lib/wstx-asl-3.2.7.jar (with props) Removed: lucene/nutch/trunk/lib/apache-solr-common-1.3.0.jar lucene/nutch/trunk/lib/apache-solr-solrj-1.3.0.jar lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar lucene/nutch/trunk/lib/slf4j-api-1.4.3.jar Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=921831r1=921830r2=921831view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Mar 11 13:06:12 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-798 Upgrade to SOLR1.4 and its dependencies (jnioche) + * NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche) * NUTCH-782 Ability to order htmlparsefilters (jnioche) Added: lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar?rev=921831view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar?rev=921831view=auto == Binary file - no diff available. 
Propchange: lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/commons-httpclient-3.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-httpclient-3.1.jar?rev=921831view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/commons-httpclient-3.1.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/commons-io-1.4.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-io-1.4.jar?rev=921831view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/commons-io-1.4.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar?rev=921831view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar?rev=921831view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar?rev=921831view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/wstx-asl-3.2.7.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/wstx-asl-3.2.7.jar?rev=921831view=auto
svn commit: r921840 - in /lucene/nutch/trunk: CHANGES.txt conf/parse-plugins.xml src/plugin/build.xml src/plugin/parse-mp3/ src/plugin/parse-rtf/
Author: jnioche Date: Thu Mar 11 13:25:44 2010 New Revision: 921840 URL: http://svn.apache.org/viewvc?rev=921840view=rev Log: NUTCH-801 Remove RTF and MP3 parse plugins Removed: lucene/nutch/trunk/src/plugin/parse-mp3/ lucene/nutch/trunk/src/plugin/parse-rtf/ Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/parse-plugins.xml lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=921840r1=921839r2=921840view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Mar 11 13:25:44 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-801 Remove RTF and MP3 parse plugins (jnioche) + * NUTCH-798 Upgrade to SOLR1.4 and its dependencies (jnioche) * NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche) Modified: lucene/nutch/trunk/conf/parse-plugins.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/parse-plugins.xml?rev=921840r1=921839r2=921840view=diff == --- lucene/nutch/trunk/conf/parse-plugins.xml (original) +++ lucene/nutch/trunk/conf/parse-plugins.xml Thu Mar 11 13:25:44 2010 @@ -124,13 +124,11 @@ /mimeType mimeType name=text/richtext - plugin id=parse-rtf / - plugin id=parse-msword / + plugin id=parse-tika / /mimeType mimeType name=text/rtf - plugin id=parse-rtf / - plugin id=parse-msword / + plugin id=parse-tika / /mimeType mimeType name=text/sgml @@ -198,8 +196,6 @@ alias name=parse-html extension-id=org.apache.nutch.parse.html.HtmlParser / alias name=parse-js extension-id=JSParser / - alias name=parse-mp3 - extension-id=org.apache.nutch.parse.mp3.MP3Parser / alias name=parse-msexcel extension-id=org.apache.nutch.parse.msexcel.MSExcelParser / alias name=parse-mspowerpoint @@ -212,10 +208,8 @@ extension-id=org.apache.nutch.parse.pdf.PdfParser / alias name=parse-rss extension-id=org.apache.nutch.parse.rss.RSSParser / -alias name=feed 
-extension-id=org.apache.nutch.parse.feed.FeedParser / - alias name=parse-rtf - extension-id=org.apache.nutch.parse.rtf.RTFParseFactory / + alias name=feed + extension-id=org.apache.nutch.parse.feed.FeedParser / alias name=parse-swf extension-id=org.apache.nutch.parse.swf.SWFParser / alias name=parse-text Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=921840r1=921839r2=921840view=diff == --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Thu Mar 11 13:25:44 2010 @@ -52,14 +52,12 @@ ant dir=parse-ext target=deploy/ ant dir=parse-html target=deploy/ ant dir=parse-js target=deploy/ - !-- ant dir=parse-mp3 target=deploy/ -- ant dir=parse-msexcel target=deploy/ ant dir=parse-mspowerpoint target=deploy/ ant dir=parse-msword target=deploy/ ant dir=parse-oo target=deploy/ ant dir=parse-pdf target=deploy/ ant dir=parse-rss target=deploy/ - !-- ant dir=parse-rtf target=deploy/ -- ant dir=parse-swf target=deploy/ ant dir=parse-text target=deploy/ ant dir=parse-tika target=deploy/
svn commit: r919358 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/solr/SolrIndexer.java src/java/org/apache/nutch/indexer/solr/SolrWriter.java
Author: jnioche Date: Fri Mar 5 10:09:08 2010 New Revision: 919358 URL: http://svn.apache.org/viewvc?rev=919358view=rev Log: NUTCH-799 SOLRIndexer to commit once all reducers have finished Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=919358r1=919357r2=919358view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Mar 5 10:09:08 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche) + * NUTCH-782 Ability to order htmlparsefilters (jnioche) * NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via jnioche) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=919358r1=919357r2=919358view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Fri Mar 5 10:09:08 2010 @@ -37,6 +37,8 @@ import org.apache.nutch.indexer.NutchIndexWriterFactory; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; +import org.apache.solr.client.solrj.SolrServer; +import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer; public class SolrIndexer extends Configured implements Tool { @@ -71,6 +73,12 @@ FileOutputFormat.setOutputPath(job, tmp); try { JobClient.runJob(job); + // do the commits once and for all the reducers in one go + SolrServer solr = new CommonsHttpSolrServer(solrUrl); + solr.commit(); +} +catch (Exception e){ + LOG.error(e); } finally { FileSystem.get(job).delete(tmp, true); } Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=919358r1=919357r2=919358view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Fri Mar 5 10:09:08 2010 @@ -74,7 +74,7 @@ solr.add(inputDocs); inputDocs.clear(); } - solr.commit(); + // solr.commit(); } catch (final SolrServerException e) { throw makeIOException(e); }
svn commit: r917557 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/parse/HtmlParseFilters.java
Author: jnioche Date: Mon Mar 1 15:08:05 2010 New Revision: 917557 URL: http://svn.apache.org/viewvc?rev=917557view=rev Log: NUTCH-782: Ability to order htmlparsefilters Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=917557r1=917556r2=917557view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 1 15:08:05 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-782 Ability to order htmlparsefilters (jnioche) + * NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via jnioche) * NUTCH-790 Some external javadoc links are broken (siren) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=917557r1=917556r2=917557view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 1 15:08:05 2010 @@ -996,6 +996,18 @@ for most people would be img,script,link./description /property +property + namehtmlparsefilter.order/name + value/value + descriptionThe order by which HTMLParse filters are applied. + If empty, all available HTMLParse filters (as dictated by properties + plugin-includes and plugin-excludes above) are loaded and applied in system + defined order. If not empty, only named filters are loaded and applied + in given order. + HTMLParse filter ordering MAY have an impact + on end result, as some filters could rely on the metadata generated by a previous filter. 
+ /description +/property !-- urlfilter plugin properties -- Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=917557r1=917556r2=917557view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Mon Mar 1 15:08:05 2010 @@ -17,6 +17,7 @@ package org.apache.nutch.parse; +import java.util.ArrayList; import java.util.HashMap; import org.apache.nutch.protocol.Content; @@ -30,12 +31,23 @@ public class HtmlParseFilters { private HtmlParseFilter[] htmlParseFilters; + + public static final String HTMLPARSEFILTER_ORDER = htmlparsefilter.order; public HtmlParseFilters(Configuration conf) { +String order = conf.get(HTMLPARSEFILTER_ORDER); ObjectCache objectCache = ObjectCache.get(conf); this.htmlParseFilters = (HtmlParseFilter[]) objectCache.getObject(HtmlParseFilter.class.getName()); if (htmlParseFilters == null) { -HashMapString, HtmlParseFilter filters = + /* + * If ordered filters are required, prepare array of filters based on + * property + */ + String[] orderedFilters = null; + if (order != null !order.trim().equals()) { +orderedFilters = order.split(\\s+); + } +HashMapString, HtmlParseFilter filterMap = new HashMapString, HtmlParseFilter(); try { ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(HtmlParseFilter.X_POINT_ID); @@ -45,12 +57,31 @@ for (int i = 0; i extensions.length; i++) { Extension extension = extensions[i]; HtmlParseFilter parseFilter = (HtmlParseFilter) extension.getExtensionInstance(); -if (!filters.containsKey(parseFilter.getClass().getName())) { -filters.put(parseFilter.getClass().getName(), parseFilter); +if (!filterMap.containsKey(parseFilter.getClass().getName())) { +filterMap.put(parseFilter.getClass().getName(), parseFilter); } } -HtmlParseFilter[] htmlParseFilters = 
filters.values().toArray(new HtmlParseFilter[filters.size()]); -objectCache.setObject(HtmlParseFilter.class.getName(), htmlParseFilters); +HtmlParseFilter[] htmlParseFilters = filterMap.values().toArray(new HtmlParseFilter[filterMap.size()]); +/* + * If no ordered filters required, just get the filters in an + * indeterminate order + */ +if (orderedFilters == null) { + objectCache.setObject(HtmlParseFilter.class.getName(), htmlParseFilters
svn commit: r910454 - in /lucene/nutch/trunk/src/plugin/languageidentifier/src: java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.j
Author: jnioche Date: Tue Feb 16 10:20:22 2010 New Revision: 910454 URL: http://svn.apache.org/viewvc?rev=910454view=rev Log: NUTCH-794 : Language Identification must use check the parse metadata for language values Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=910454r1=910453r2=910454view=diff == --- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Tue Feb 16 10:20:22 2010 @@ -91,15 +91,33 @@ Parse parse = parseResult.get(content.getUrl()); +String lang = getLanguageFromMetadata(parse.getData().getParseMeta()); +if (lang != null) { + parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang); + return parseResult; +} + // Trying to find the document's language LanguageParser parser = new LanguageParser(doc); -String lang = parser.getLanguage(); +lang = parser.getLanguage(); if (lang != null) { parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang); } return parseResult; } + + // Check in the metadata whether the language has already been stored there by Tika + private static String getLanguageFromMetadata(Metadata parseMD){ +// dublin core +String lang = parseMD.get(dc.language); +if (lang!=null) return lang; +// meta content-language +lang = parseMD.get(content-language); +if (lang!=null) return lang; +// lang attribute +return parseMD.get(lang); + } static class LanguageParser { Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=910454r1=910453r2=910454view=diff == --- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Tue Feb 16 10:20:22 2010 @@ -40,7 +40,8 @@ htmlheadmeta http-equiv=\content-language\ content=\en\titledocument 2 title/headbodythis is english/body/html, htmlheadmeta name=\dc.language\ content=\en\titledocument 3 title/headbodythis is english/body/html }; - String metalanguages[] = { fi, en, en }; + // NUTCH-794 : temporarily replaced fi and en with null + String metalanguages[] = { null, en, en }; /** * Test parsing of language identifiers from html
svn commit: r906907 - in /lucene/nutch/trunk: CHANGES.txt conf/domain-suffixes.xml
Author: jnioche Date: Fri Feb 5 11:52:57 2010 New Revision: 906907 URL: http://svn.apache.org/viewvc?rev=906907view=rev Log: NUTCH-786 Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/domain-suffixes.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=906907r1=906906r2=906907view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Feb 5 11:52:57 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-786 Improvement to the list of suffix domains (jnioche) + * NUTCH-775 Enhance searcher interface (siren) * NUTCH-781 Update Tika to v0.6 (jnioche) Modified: lucene/nutch/trunk/conf/domain-suffixes.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/domain-suffixes.xml?rev=906907r1=906906r2=906907view=diff == --- lucene/nutch/trunk/conf/domain-suffixes.xml (original) +++ lucene/nutch/trunk/conf/domain-suffixes.xml Fri Feb 5 11:52:57 2010 @@ -1744,6 +1744,16 @@ suffix domain=retina.ar / suffix domain=uba.ar / +suffix domain=com.ar / +suffix domain=edu.ar / +suffix domain=gob.ar / +suffix domain=gov.ar / +suffix domain=int.ar / +suffix domain=mil.ar / +suffix domain=net.ar / +suffix domain=org.ar / +suffix domain=tur.ar / + !-- arpa : http://en.wikipedia.org/wiki/.arpa-- suffix domain=e164.arpa / suffix domain=in-addr.arpa / @@ -1955,6 +1965,14 @@ !-- co : http://en.wikipedia.org/wiki/.co-- +suffix domain=com.co / +suffix domain=org.co / +suffix domain=edu.co / +suffix domain=gov.co / +suffix domain=net.co / +suffix domain=mil.co / +suffix domain=nom.co / + !-- com : http://en.wikipedia.org/wiki/.com-- !-- coop : http://en.wikipedia.org/wiki/.coop-- @@ -2215,9 +2233,26 @@ !-- id : http://en.wikipedia.org/wiki/.id-- +suffix domain=ac.id / +suffix domain=co.id / +suffix domain=net.id / +suffix domain=or.id / +suffix domain=web.id / +suffix domain=sch.id / +suffix domain=mil.id / +suffix domain=go.id / + !-- ie : http://en.wikipedia.org/wiki/.ie-- !-- il 
: http://en.wikipedia.org/wiki/.il-- + suffix domain=ac.il / + suffix domain=co.il / + suffix domain=org.il / + suffix domain=net.il / + suffix domain=k12.il / + suffix domain=gov.il / + suffix domain=muni.il / + suffix domain=idf.il / !-- im : https://www.nic.im/pdfs/imfaqs.pdf-- suffix domain=co.im / @@ -2854,6 +2889,11 @@ suffix domain=org.mw / !-- mx : http://www.nic.mx/-- +suffix domain=com.mx / +suffix domain=edu.mx / +suffix domain=gob.mx / +suffix domain=net.mx / +suffix domain=org.mx / !-- my : http://www.mynic.net.my/-- @@ -3661,6 +3701,19 @@ !-- nu : http://en.wikipedia.org/wiki/.nu-- !-- nz : http://en.wikipedia.org/wiki/.nz-- + suffix domain=ac.nz / + suffix domain=co.nz / + suffix domain=cri.nz / + suffix domain=geek.nz / + suffix domain=gen.nz / + suffix domain=govt.nz / + suffix domain=iwi.nz / + suffix domain=maori.nz / + suffix domain=mil.nz / + suffix domain=net.nz / + suffix domain=org.nz / + suffix domain=parliament.nz / + suffix domain=school.nz / !-- om : http://en.wikipedia.org/wiki/.om-- @@ -4344,7 +4397,28 @@ !-- yu : http://www.nic.yu/pravilnik-e.html-- -!-- za : http://www.zadna.org.za/slds.html-- +!-- za : http://www.zadna.org.za/slds.html + http://en.wikipedia.org/wiki/.za + -- +suffix domain=ac.za / +suffix domain=city.za / +suffix domain=co.za / +suffix domain=edu.za / +suffix domain=gov.za / +suffix domain=law.za / +suffix domain=mil.za / +suffix domain=nom.za / +suffix domain=org.za / +suffix domain=school.za / +suffix domain=ecape.school.za / +suffix domain=fs.school.za / +suffix domain=gp.school.za / +suffix domain=kzn.school.za / +suffix domain=mpm.school.za / +suffix domain=ncape.school.za / +suffix domain=lp.school.za / +suffix domain=nw.school.za / +suffix domain=wcape.school.za / !-- zm : http://en.wikipedia.org/wiki/.zm--
svn commit: r905550 [1/2] - /lucene/nutch/trunk/conf/tika-mimetypes.xml
Author: jnioche Date: Tue Feb 2 09:31:19 2010 New Revision: 905550 URL: http://svn.apache.org/viewvc?rev=905550&view=rev Log: NUTCH-781 : updated tika-mimetypes.xml Modified: lucene/nutch/trunk/conf/tika-mimetypes.xml
svn commit: r905228 - in /lucene/nutch/trunk/lib: tika-core-0.5.jar tika-core-0.6.jar
Author: jnioche Date: Mon Feb 1 09:59:50 2010 New Revision: 905228 URL: http://svn.apache.org/viewvc?rev=905228&view=rev Log: NUTCH-781: upgrade tika to version 0.6 Added: lucene/nutch/trunk/lib/tika-core-0.6.jar (with props) Removed: lucene/nutch/trunk/lib/tika-core-0.5.jar Added: lucene/nutch/trunk/lib/tika-core-0.6.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-core-0.6.jar?rev=905228&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/tika-core-0.6.jar -- svn:mime-type = application/octet-stream
svn commit: r905229 - /lucene/nutch/trunk/CHANGES.txt
Author: jnioche Date: Mon Feb 1 10:03:07 2010 New Revision: 905229 URL: http://svn.apache.org/viewvc?rev=905229&view=rev Log: NUTCH-781: upgrade tika to version 0.6 Modified: lucene/nutch/trunk/CHANGES.txt Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=905229&r1=905228&r2=905229&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Feb 1 10:03:07 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-781 Update Tika to v0.6 (jnioche) + * NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count (stack + jnioche) * NUTCH-655 Injecting Crawl metadata (jnioche)
svn commit: r897825 - in /lucene/nutch/trunk/src: java/org/apache/nutch/util/MimeUtil.java test/org/apache/nutch/protocol/TestContent.java
Author: jnioche Date: Mon Jan 11 10:13:21 2010 New Revision: 897825 URL: http://svn.apache.org/viewvc?rev=897825view=rev Log: fix for NUTCH-767 : reverted original expected values for test + treat text/plain as a default mime-type from Tika Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=897825r1=897824r2=897825view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Mon Jan 11 10:13:21 2010 @@ -159,6 +159,7 @@ if (this.mimeMagic) { MimeType magicType = this.mimeTypes.getMimeType(data); if (magicType != null !magicType.getName().equals(MimeTypes.OCTET_STREAM) + !magicType.getName().equals(MimeTypes.PLAIN_TEXT) type != null !type.getName().equals(magicType.getName())) { // If magic enabled and the current mime type differs from that of the // one returned from the magic, take the magic mimeType Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=897825r1=897824r2=897825view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Mon Jan 11 10:13:21 2010 @@ -63,28 +63,19 @@ http://www.foo.com/;, .getBytes(UTF8), text/html; charset=UTF-8, p, conf); -// TODO check potential Tika issue and -// revert the expected value to text/html -// see https://issues.apache.org/jira/browse/NUTCH-767 -assertEquals(text/plain, c.getContentType()); +assertEquals(text/html, c.getContentType()); c = new Content(http://www.foo.com/foo.html;, http://www.foo.com/;, 
.getBytes(UTF8), , p, conf); -// TODO check potential Tika issue and -// revert the expected value to text/html -// see https://issues.apache.org/jira/browse/NUTCH-767 -assertEquals(text/plain, c.getContentType()); +assertEquals(text/html, c.getContentType()); c = new Content(http://www.foo.com/foo.html;, http://www.foo.com/;, .getBytes(UTF8), null, p, conf); -// TODO check potential Tika issue and -// revert the expected value to text/html -// see https://issues.apache.org/jira/browse/NUTCH-767 -assertEquals(text/plain, c.getContentType()); +assertEquals(text/html, c.getContentType()); c = new Content(http://www.foo.com/;, http://www.foo.com/;, @@ -108,10 +99,7 @@ http://www.foo.com/;, .getBytes(UTF8), , p, conf); -// TODO check that Tika returns the right value and -// revert to the default type -// see https://issues.apache.org/jira/browse/NUTCH-767 -assertEquals(text/plain, c.getContentType()); +assertEquals(MimeTypes.OCTET_STREAM, c.getContentType()); c = new Content(http://www.foo.com/;, http://www.foo.com/;,
svn commit: r897180 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Author: jnioche Date: Fri Jan 8 12:01:46 2010 New Revision: 897180 URL: http://svn.apache.org/viewvc?rev=897180view=rev Log: NUTCH-269 : OOME because no upper-bound on inlinks count (stack + jnioche) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=897180r1=897179r2=897180view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Jan 8 12:01:46 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count (stack + jnioche) + * NUTCH-655 Injecting Crawl metadata (jnioche) * NUTCH-658 Use counters to report fetching and parsing status (jnioche) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=897180r1=897179r2=897180view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Fri Jan 8 12:01:46 2010 @@ -384,6 +384,14 @@ /property property + namedb.update.max.inlinks/name + value1/value + descriptionMaximum number of inlinks to take into account when updating + a URL score in the crawlDB. Only the best scoring inlinks are kept. 
+ /description +/property + +property namedb.ignore.internal.links/name valuetrue/value descriptionIf true, when adding new links to a page, links from Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=897180r1=897179r2=897180view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Fri Jan 8 12:01:46 2010 @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.Iterator; +import java.util.List; import java.io.IOException; // Commons Logging imports @@ -27,6 +28,7 @@ import org.apache.hadoop.io.*; import org.apache.hadoop.mapred.*; +import org.apache.hadoop.util.PriorityQueue; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; @@ -37,7 +39,7 @@ private int retryMax; private CrawlDatum result = new CrawlDatum(); - private ArrayListCrawlDatum linked = new ArrayListCrawlDatum(); + private InlinkPriorityQueue linked = null; private ScoringFilters scfilters = null; private boolean additionsAllowed; private int maxInterval; @@ -51,6 +53,8 @@ maxInterval = job.getInt(db.fetch.interval.max, 0 ); if (oldMaxInterval 0 maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY; schedule = FetchScheduleFactory.getFetchSchedule(job); +int maxLinks = job.getInt(db.update.max.inlinks, 1); +linked = new InlinkPriorityQueue(maxLinks); } public void close() {} @@ -111,7 +115,7 @@ } else { link = datum; } -linked.add(link); +linked.insert(link); break; case CrawlDatum.STATUS_SIGNATURE: signature = datum.getSignature(); @@ -120,13 +124,21 @@ LOG.warn(Unknown status, key: + key + , datum: + datum); } } - + +// copy the content of the queue into a List +// in reversed order +int numLinks = linked.size(); 
+ListCrawlDatum linkList = new ArrayListCrawlDatum(numLinks); +for (int i = numLinks - 1; i = 0; i--) { + linkList.add(linked.pop()); +} + // if it doesn't already exist, skip it if (!oldSet !additionsAllowed) return; // if there is no fetched datum, perhaps there is a link -if (!fetchSet linked.size() 0) { - fetch = linked.get(0); +if (!fetchSet linkList.size() 0) { + fetch = linkList.get(0); fetchSet = true; } @@ -260,7 +272,7 @@ } try { - scfilters.updateDbScore((Text)key, oldSet ? old : null, result, linked); + scfilters.updateDbScore((Text)key, oldSet ? old : null, result, linkList); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn(Couldn't update score, key= + key + : + e); @@ -270,5 +282,20 @@ result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY); output.collect(key, result); } + +} +class InlinkPriorityQueue extends PriorityQueueCrawlDatum { + + public InlinkPriorityQueue(int maxSize) { +initialize(maxSize); + } + + /** Determines the ordering of objects
svn commit: r896539 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Injector.java
Author: jnioche Date: Wed Jan 6 17:01:51 2010 New Revision: 896539 URL: http://svn.apache.org/viewvc?rev=896539view=rev Log: NUTCH-655 : Injecting Crawl metadata (jnioche) Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=896539r1=896538r2=896539view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Jan 6 17:01:51 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-655 Injecting Crawl metadata (jnioche) + * NUTCH-658 Use counters to report fetching and parsing status (jnioche) * NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=896539r1=896538r2=896539view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Jan 6 17:01:51 2010 @@ -37,10 +37,21 @@ import org.apache.nutch.util.NutchJob; /** This class takes a flat file of URLs and adds them to the of pages to be - * crawled. Useful for bootstrapping the system. */ + * crawled. Useful for bootstrapping the system. + * The URL files contain one URL per line, optionally followed by custom metadata + * separated by tabs with the metadata key separated from the corresponding value by '='. br + * Note that some metadata keys are reserved : br + * - inutch.score/i : allows to set a custom score for a specific URL br + * - inutch.fetchInterval/i : allows to set a custom fetch interval for a specific URL br + * e.g. 
http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source + **/ public class Injector extends Configured implements Tool { public static final Log LOG = LogFactory.getLog(Injector.class); - + + /** metadata key reserved for setting a custom score for a specific URL */ + public static String nutchScoreMDName = nutch.score; + /** metadata key reserved for setting a custom fetchInterval for a specific URL */ + public static String nutchFetchIntervalMDName = nutch.fetchInterval; /** Normalize and filter injected urls. */ public static class InjectMapper implements MapperWritableComparable, Text, Text, CrawlDatum { @@ -68,6 +79,36 @@ OutputCollectorText, CrawlDatum output, Reporter reporter) throws IOException { String url = value.toString(); // value is line of text + // if tabs : metadata that could be stored + // must be name=value and separated by \t + float customScore = -1f; + int customInterval = interval; + MapString,String metadata = new TreeMapString,String(); + if (url.indexOf(\t)!=-1){ + String[] splits = url.split(\t); + url = splits[0]; + for (int s=1;ssplits.length;s++){ + // find separation between name and value + int indexEquals = splits[s].indexOf(=); + if (indexEquals==-1) { + // skip anything without a = + continue; + } + String metaname = splits[s].substring(0, indexEquals); + String metavalue = splits[s].substring(indexEquals+1); + if (metaname.equals(nutchScoreMDName)) { + try { + customScore = Float.parseFloat(metavalue);} + catch (NumberFormatException nfe){} + } + else if (metaname.equals(nutchFetchIntervalMDName)) { + try { + customInterval = Integer.parseInt(metavalue);} + catch (NumberFormatException nfe){} + } + else metadata.put(metaname,metavalue); + } + } try { url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT); url = filters.filter(url); // filter the url @@ -77,17 +118,27 @@ } if (url != null) { // if it passes value.set(url); // collect it -CrawlDatum datum = new 
CrawlDatum(CrawlDatum.STATUS_INJECTED, interval); +CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, customInterval); datum.setFetchTime(curTime); -datum.setScore(scoreInjected); -try { - scfilters.injectedScore(value, datum); -} catch (ScoringFilterException e) { - if (LOG.isWarnEnabled()) { -LOG.warn(Cannot filter
svn commit: r895972 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java src/java/org/apache/nutch/parse/ParseSegment.java src/java/org/apache/nutch/protocol/ProtocolStatus.java
Author: jnioche Date: Tue Jan 5 10:14:49 2010 New Revision: 895972 URL: http://svn.apache.org/viewvc?rev=895972view=rev Log: NUTCH-658 : Add Counter for # of doc fetched in Reporter Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=895972r1=895971r2=895972view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Jan 5 10:14:49 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-658 Use counters to report fetching and parsing status (jnioche) + * NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann) * NUTCH-767 Update Tika to v0.5 for the MimeType detection (Julien Nioche via ab) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=895972r1=895971r2=895972view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Jan 5 10:14:49 2010 @@ -607,6 +607,7 @@ LOG.debug(Denied by robots.txt: + fit.url); } output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); +reporter.incrCounter(FetcherStatus, robots_denied, 1); continue; } if (rules.getCrawlDelay() 0) { @@ -615,6 +616,7 @@ fetchQueues.finishFetchItem(fit, true); LOG.debug(Crawl-Delay for + fit.url + too long ( + rules.getCrawlDelay() + ), skipping); output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); + reporter.incrCounter(FetcherStatus, robots_denied_maxcrawldelay, 1); continue; } else { FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID); 
@@ -630,6 +632,8 @@ String urlString = fit.url.toString(); + reporter.incrCounter(FetcherStatus, status.getName(), 1); + switch(status.getCode()) { case ProtocolStatus.WOULDBLOCK: @@ -664,6 +668,7 @@ } else { // stop redirecting redirecting = false; + reporter.incrCounter(FetcherStatus, FetchItem.notCreated.redirect, 1); } } } @@ -701,6 +706,7 @@ } else { // stop redirecting redirecting = false; +reporter.incrCounter(FetcherStatus, FetchItem.notCreated.redirect, 1); } } else { // stop redirecting @@ -926,6 +932,7 @@ if (parseResult != null !parseResult.isEmpty()) { Parse p = parseResult.get(content.getUrl()); if (p != null) { + reporter.incrCounter(ParserStatus, ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1); return p.getData().getStatus(); } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=895972r1=895971r2=895972view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue Jan 5 10:14:49 2010 @@ -93,6 +93,8 @@ Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); + reporter.incrCounter(ParserStatus, ParseStatus.majorCodes[parseStatus.getMajorCode()], 1); + if (!parseStatus.isSuccess()) { LOG.warn(Error parsing: + key + : + parseStatus); parse = parseStatus.getEmptyParse(getConf()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=895972r1=895971r2=895972view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol
svn commit: r894716 - in /lucene/nutch/trunk: site/credits.html site/credits.pdf src/site/src/documentation/content/xdocs/credits.xml
Author: jnioche Date: Wed Dec 30 21:34:28 2009 New Revision: 894716 URL: http://svn.apache.org/viewvc?rev=894716view=rev Log: Adding J. Nioche to the list of committers Modified: lucene/nutch/trunk/site/credits.html lucene/nutch/trunk/site/credits.pdf lucene/nutch/trunk/src/site/src/documentation/content/xdocs/credits.xml Modified: lucene/nutch/trunk/site/credits.html URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.html?rev=894716r1=894715r2=894716view=diff == --- lucene/nutch/trunk/site/credits.html (original) +++ lucene/nutch/trunk/site/credits.html Wed Dec 30 21:34:28 2009 @@ -252,6 +252,10 @@ /li li +a href=http://www.digitalpebble.com/;Julien Nioche/a +/li + +li a href=http://people.apache.org/~siren;Sami Siren/a /li @@ -261,7 +265,7 @@ /div -a name=N10042/aa name=Friends/a +a name=N10047/aa name=Friends/a h2 class=h3Friends/h2 div class=section ul @@ -292,7 +296,7 @@ /div -a name=N1008C/aa name=Sponsors/a +a name=N10091/aa name=Sponsors/a h2 class=h3Sponsors/h2 div class=section ul Modified: lucene/nutch/trunk/site/credits.pdf URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.pdf?rev=894716r1=894715r2=894716view=diff == --- lucene/nutch/trunk/site/credits.pdf (original) +++ lucene/nutch/trunk/site/credits.pdf Wed Dec 30 21:34:28 2009 @@ -58,10 +58,10 @@ endobj 14 0 obj - /Length 2451 /Filter [ /ASCII85Decode /FlateDecode ] + /Length 2523 /Filter [ /ASCII85Decode /FlateDecode ] stream 
-Gat=-?#uMo'RekG6)QN,j4X4IDDVKe8Cd99l3ZjVX=Q[b.BLBhuXiFrUmi*,VH1...@__ei(`FER#:35J-3KkLmIc0$E/-9at+C5'JL_g:M-TJq:j=p'n#r(rJ*A@`ZXIRrcMN2^?jo+TV(?8=o8Z43rXNS`.lKsK^`(anX=FV;m0$Sh;[*WTGKJTabq4PBnG%HT1]HqfD`^fK]+DMGC][A;AauDhV\=BF-6%+--+,-R^Q`J-rq1^/aI!E7A77`*g/j,2T[+;*_3p]F4O:C]]NLjH]*Wa3p$E[tVeq1,Q9`XAoU`^21U3M3(8Y,Y8+d:rLd/?ErprgkLDtPtOH9;iBRt,/sV1!^kLNh#W?Xc;Z=^0N*TO_9#QGEfj)K%-/N_]VP#g)tHj?tSb1CVc1X8#u7Ig^0aenmP)fY)!'l;.LN!;sR5V8_qKq?BssHf[1`%[p=)^/[j['-n/*i...@j8c=+o1v+oasfe;/X[k+:G!Fa`7,=6%UB(je=+0Jk]pJuk0$Fulw0jqci.wtfp?...@jip'*(4IN(nFWUC'4$+ASc0$'Hb0).HOX.nYPOI'ZfjA4lGoAq.OB7pr*@;,dS]W^Y..TVojXN,X0'^8csme]rov...@mrm=7Ume#n129mnp^[\n!AiOV7^V6PJ*$4n5)Ta(oU;!=fZ7KED][iPuL2sG4B2QRR;R8h]pal\kC5TJ`i!B1dgG;M/ZcccB^2R[RqCf=sJoXH_T-PK\n,\QIE;N:-%H:i6IS9GRTL\a#TIr0[=g'YE]#JSS-7VY96Wr.ok]i=r\uB18i+V7Ss'SGHVU[Qb5/Q5TWMK3h(]eqes3?1^...@ad(So.oH1',$NU^Vd0*s7V-7,/:Z[3MHCgmC(j-Qr=m,Sk0?Vek)h6``t6vuk#7h5gqp...@p.[pq#d3goh`ulzV]:Xb:Hp^fcB(!R F73]PiP4t\GcOq?o]khE6Y22'f]RU%Dng'BBRjXCeQL1A4N6Sf...@aa$iphpn0eW`2/dlc)_DpfE4...@s(+[R;P?Rin#F'!O_VKp/mC^:W\HgRJ*)iCHjY0+n`Fo^;UV,l...@r/P-0YPe`DIbReP7Ok^:5L1ee7)3\(au1D0)OB\]4nLeh+trE:]o[ep'HMp0S6f*#u`m_=1)$r+/?0fr%8)ZF4M9-8$qiO8pl$RTRX;'t5i!`R=2hX%*)iE^Beb'hN'B_]]Q9K$mCB;OKPjEB#qBR8FHImcT#TBgL.Y1X%WTY=K?+bCYbdm?3_Q,4o=AA(i]nk\S*G)?IJnPSJACaqk81+WAU$53q%...@1-fj`h8#K`Kbreb;?jHe0.q[!7dAKB7am6Gc+o6UEhb-kk2j:)X;T.C0o[$Osp`HGL*)*aqOa!...@u:!nGBQoda#s8BMf2l]$XQn0D%X)b'213CgRMXW+^tAD'KIJ5,Sk/mc7!*...@$xk*%*p=z.(r-Vcj#EOrcOdWY'T@sLig2Y'taK-,a-qM[rFHonKAtmbo1ejl28\D7UmR'edeKrfRJNN6G'FB5XBB=$3rGK9B(0DX,]nr`d...@`-q)pJ?'%5m-cho3$21...@l!4?0yck3^lHrB4+UOCPg#tMPmHhVcMDK0f-5e+uxw]j#...@qs-%sbew.jf15ga\^`,1S+C=C-df;1RW])_L(N8X,+O2I;PjoBEdqj[4Bb`o8s'nn'=NZ0ZN-f]T05Hk/tG(2[H)qDrNDb0!qqU^G'Zp4:7+dW)^0sm#,\V;7V0cjr1?65,n5neQ]uX,I?i]`03tf*jceD7Ai2le5*K]0;mM5YCq`!ofr]a8\.SR$.PQBW:iuM@:=...@o5j8t`m*m[rha9?iqa7'0q\7-8L)l5hKBnL'aVDO%.'DiEo(PG2^...@]i$qlml01\/ 
2B4k5s50%$QKEcg-JXBIlL(H]XQic*j4!D2]JL2)q4`D](P7bS1u/um*'dp)f%/qYmj-Z?$AUlBL1%XlNB'tb/EI[Hn+K8[3Y8$Ind)?+Q^H,QmfH2%'N%_oPR?e$KQgk$dJLFe)cW2K6.Q%P8Zo1kRAhX59n?j...@mez0l[inp$=f$knplk[)_Je...@\!bxq#rkbmc5ss!-'*2##SYi$'OlZ.b%8Sra-l^V8:$KYrhDcQEg/WH3R`D=psS7?oce?FP[7Pe_$g8ck,p+RDZPcD%]pJJI,k...@j3h?p5n:o%3krIkL#pRsD#!?h]F0FGL\q2oJ5:7u3j.f.!$/5=L,M8k5bmiXYO9tD%($\s6U-#QdabpH@gbM8BONu7[aH:N\^fLQb6,C?YISJ#LRlDar#mfYU4Fk*_:IRVLV?eD)?r#-0;hW[7M/pHR5ZLdq~ +Gat=.gNfR:NH+I/skrF!LtiR*u_M./K*UroO.4X$)QZs;\?nFHIT,qam^^^`%^vg*j...@ah*$digos4s'*4h9'\kgPU0t;o5m7...@1eb.1$uf_9g1=lniw#-T^ZjK7)qgDMsZE.@:Ig7nKWul4[s+7AY5@:h,KSD5E!GsAo+\kBCPVTaQrORMGYs!1bM23u/!\(HFEC#eiN5U\Qq3s-...@fq1r(m%Ktb[WS\4ue)+/C6=IN_'o$_khc92e...@$[$81.ha[n4eqh1jn@Wat_:h8CACsU_H6qf==ak]S$1Q#_\,o.Cdrh[?V9EYAl2ZQojP:a...@apgbsf34l6d%25@$;eS!,MWHdkHWlp-u0hZ[1MHe$:Oq,mjh[08gu1...@?no=hc0'?1E4O_25*I!Cu'o.d0]$I4o`?b6r5R[VqJHtTH0J\W](IW#f3,huj]v]t5mn...@\j:0YBkS%m8=K5g;C0^XQEJU[E1Pf:kk-Zq_e+Mf4PFQ`h2Y188-k]p#g(@,5VfseCbE?F:`3(CDk$A_c#$'6^EnKoo'=DVP[cj(1`Z!/GLqMlI'_PZ=j'(/nk...@fmq3\jo#r1]S]l-i,?Ph-``Gi-s;WK:i$e4Z``]21SYbbc$nE5rp]=[@(H\rE*[qO7#Ynt4#%'?4'qE87L)qbI0Jpm,!pItf#5'$l$ec7b_j,l[#2co1DBg(Z_cUknM!=eoouHLA/R09=REWfcsp#HaX$g%+;k.b)A5`W!ateTpHt/+0leNc/VuN[:mh^84EM8?!]/Z6e'(ch95enOV7h.L'?p:(esRj9'XYQ4`BF1r1#H(tG6F0S8$epBAbhb6Oiboc`RZIEJ%0XX*Wc+Z'O$.q\)u=Ws3F:cg9n]iv[...@+4udv.e-b:N7`0^BeYXnBmjdfWIcI0p^]7XrK7f)7o8Cb9DNcU*5f)Y$ fZ/@DdNgr_D,g...@iup?phkyk5$kx!m0_fthr=hu