svn commit: r931098 - in /lucene/nutch/trunk: ./ conf/ lib/ src/plugin/ src/plugin/parse-tika/ src/plugin/parse-tika/lib/ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/

2010-04-06 Thread jnioche
Author: jnioche
Date: Tue Apr  6 11:38:26 2010
New Revision: 931098

URL: http://svn.apache.org/viewvc?rev=931098&view=rev
Log:
NUTCH-810 Upgraded to Tika 0.7

Added:
lucene/nutch/trunk/lib/tika-core-0.7.jar   (with props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar   (with 
props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar   (with 
props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar   (with 
props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar   (with 
props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar   (with props)
lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar   (with 
props)
Removed:
lucene/nutch/trunk/lib/tika-core-0.6.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-0.8.0-incubator.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-0.8.0-incubator.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-0.8.0-incubating.jar
lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.6.jar
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/tika-mimetypes.xml
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml
lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml

lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java

lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=931098&r1=931097&r2=931098&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Apr  6 11:38:26 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-810 Upgrade to Tika 0.7 (jnioche)
+
 * NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call 
scfilters.initialScore on newly created URL (jnioche)
 
 * NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche)

Modified: lucene/nutch/trunk/conf/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/tika-mimetypes.xml?rev=931098&r1=931097&r2=931098&view=diff
==
--- lucene/nutch/trunk/conf/tika-mimetypes.xml (original)
+++ lucene/nutch/trunk/conf/tika-mimetypes.xml Tue Apr  6 11:38:26 2010
@@ -2198,7 +2198,11 @@
 
   mime-type type=application/x-cpio
 magic priority=50
-  match value=070707 type=host16 offset=0/
+  match value=070707 type=little16 offset=0/
+  match value=070707 type=big16 offset=0/
+  match value=070707 type=string offset=0/
+  match value=070701 type=string offset=0/
+  match value=070702 type=string offset=0/
 /magic
 glob pattern=*.cpio/
   /mime-type
@@ -3551,7 +3555,13 @@
   bad HTML, unfortunately.
  --
 root-XML localName=html/
+root-XML localName=HTML/
 root-XML localName=link/
+root-XML localName=LINK/
+root-XML localName=body/
+root-XML localName=BODY/
+root-XML localName=p/
+root-XML localName=P/
 magic priority=50
   match value=lt;!DOCTYPE HTML type=string offset=0:64/
   match value=lt;!doctype html type=string offset=0:64/

Added: lucene/nutch/trunk/lib/tika-core-0.7.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-core-0.7.jar?rev=931098&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/tika-core-0.7.jar
--
svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=931098&r1=931097&r2=931098&view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Apr  6 11:38:26 2010
@@ -32,8 +32,8 @@
  ant dir=index-basic target=deploy/
  ant dir=index-anchor target=deploy/
  ant dir=index-more target=deploy/
-ant dir=field-basic target=deploy/
-ant dir=field-boost target=deploy/
+ ant dir=field-basic target=deploy/
+ ant dir=field-boost target=deploy/
  ant dir=languageidentifier target=deploy/
  ant dir=lib-http target=deploy/
  ant dir=lib-jakarta-poi target=deploy/
@@ -65,12 +65,12 @@
  ant dir=query-basic target=deploy/
  ant dir=query-more target=deploy/
  ant dir=query-site target=deploy/
-ant dir=query-custom target=deploy/
+ ant dir=query-custom target=deploy/
  ant dir=query-url target=deploy/
  ant dir=response-json target=deploy/
  ant dir

svn commit: r926003 - in /lucene/nutch/trunk: ./ conf/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/pro

2010-03-22 Thread jnioche
Author: jnioche
Date: Mon Mar 22 09:00:11 2010
New Revision: 926003

URL: http://svn.apache.org/viewvc?rev=926003&view=rev
Log:
NUTCH-740 Configuration option to override default language for fetched pages

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml

lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=926003&r1=926002&r2=926003&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 22 09:00:11 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-740 Configuration option to override default language for fetched 
pages (Marcin Okraszewski via jnioche)
+
 * NUTCH-803 Upgrade to Hadoop 0.20.2 (ab)
 
 * NUTCH-787 Upgrade Lucene to 3.0.1. (Dawid Weiss via ab)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=926003&r1=926002&r2=926003&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 22 09:00:11 2010
@@ -228,6 +228,15 @@
   /description
 /property
 
+property
+  namehttp.accept.language/name
+  valueen-us,en-gb,en;q=0.7,*;q=0.3/value
+  descriptionValue of the Accept-Language request header field.
+  This allows selecting non-English language as default one to retrieve.
+  It is a useful setting for search engines build for certain national group.
+  /description
+/property
+
 !-- FTP properties --
 
 property

Modified: 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=926003&r1=926002&r2=926003&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 Mon Mar 22 09:00:11 2010
@@ -93,6 +93,8 @@ public abstract class HttpBase implement
 http://lucene.apache.org/nutch/bot.html;,
 nutch-ag...@lucene.apache.org);
 
+  /** The Accept-Language request header value. */
+  protected String acceptLanguage = en-us,en-gb,en;q=0.7,*;q=0.3;
 
   /**
* Maps from host to a Long naming the time it should be unblocked.
@@ -162,6 +164,7 @@ public abstract class HttpBase implement
 this.maxThreadsPerHost = conf.getInt(fetcher.threads.per.host, 1);
 this.userAgent = getAgentString(conf.get(http.agent.name), 
conf.get(http.agent.version), conf
 .get(http.agent.description), conf.get(http.agent.url), 
conf.get(http.agent.email));
+this.acceptLanguage = conf.get(http.accept.language, acceptLanguage);
 this.serverDelay = (long) (conf.getFloat(fetcher.server.delay, 1.0f) 
* 1000);
 this.maxCrawlDelay = (long)(conf.getInt(fetcher.max.crawl.delay, -1) 
* 1000);
 // backward-compatible default setting
@@ -326,6 +329,13 @@ public abstract class HttpBase implement
 return userAgent;
   }
   
+  /** Value of Accept-Language request header sent by Nutch.
+   * @return The value of the header Accept-Language header.
+   */
+  public String getAcceptLanguage() {
+ return acceptLanguage;
+  }
+
   public boolean getUseHttp11() {
 return useHttp11;
   }
@@ -470,6 +480,7 @@ public abstract class HttpBase implement
   logger.info(http.timeout =  + timeout);
   logger.info(http.content.limit =  + maxContent);
   logger.info(http.agent =  + userAgent);
+  logger.info(http.accept.language =  + acceptLanguage);
   logger.info(Protocol.CHECK_BLOCKING +  =  + checkBlocking);
   logger.info(Protocol.CHECK_ROBOTS +  =  + checkRobots);
   if (checkBlocking) {

Modified: 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=926003&r1=926002&r2=926003&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-http/src/java

svn commit: r926155 - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/tools/ src/test/org/apache/nutch/crawl/ src/test/org/ap

2010-03-22 Thread jnioche
Author: jnioche
Date: Mon Mar 22 16:19:12 2010
New Revision: 926155

URL: http://svn.apache.org/viewvc?rev=926155&view=rev
Log:
NUTCH-762 : Generator can generate several segments in one parse of the crawlDB

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java
Removed:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=926155&r1=926154&r2=926155&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 22 16:19:12 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-762 Generator can generate several segments in one parse of the 
crawlDB (jnioche)
+
 * NUTCH-740 Configuration option to override default language for fetched 
pages (Marcin Okraszewski via jnioche)
 
 * NUTCH-803 Upgrade to Hadoop 0.20.2 (ab)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=926155&r1=926154&r2=926155&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 22 16:19:12 2010
@@ -514,24 +514,21 @@
 !-- generate properties --
 
 property
-  namegenerate.max.per.host/name
+  namegenerate.max.count/name
   value-1/value
-  descriptionThe maximum number of urls per host in a single
-  fetchlist.  -1 if unlimited./description
+  descriptionThe maximum number of urls in a single
+  fetchlist.  -1 if unlimited. The urls are counted according
+  to the value of the parameter generator.count.mode.
+  /description
 /property
 
 property
-  namegenerate.max.per.host.by.ip/name
-  valuefalse/value
-  descriptionIf false, same host names are counted. If true,
-  hosts' IP addresses are resolved and the same IP-s are counted.
-  
-  -+-+-+- WARNING !!! -+-+-+-
-  When set to true, Generator will create a lot of DNS lookup
-  requests, rapidly. This may cause a DOS attack on
-  remote DNS servers, not to mention increased external traffic
-  and latency. For these reasons when using this option it is
-  required that a local caching DNS be used./description
+  namegenerate.count.mode/name
+  valuehost/value
+  descriptionDetermines how the URLs are counted for generator.max.count.
+  Default value is 'host' but can be 'domain'. Note that we do not count 
+  per IP in the new version of the Generator.
+  /description
 /property
 
 property
@@ -545,6 +542,34 @@
   updatedb will generate identical fetchlists./description
 /property
 
+property
+  namegenerate.max.per.host/name
+  value-1/value
+  description(Deprecated). Use generate.max.count and generate.count.mode 
instead.
+  The maximum number of urls per host in a single
+  fetchlist.  -1 if unlimited./description
+/property
+
+!-- urlpartitioner properties --
+property
+  namepartition.url.mode/name
+  valuebyHost/value
+  descriptionDetermines how to partition URLs. Default value is 'byHost', 
+  also takes 'byDomain' or 'byIP'. 
+  /description
+/property
+
+property
+  namecrawl.gen.delay/name
+  value60480/value
+  description
+   This value, expressed in days, defines how long we should keep the lock on 
records 
+   in CrawlDb that were just selected for fetching. If these records are not 
updated 
+   in the meantime, the lock is canceled, i.e. the become eligible for 
selecting. 
+   Default value of this is 7 days.
+  /description
+/property
+
 !-- fetcher properties --
 
 property

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=926155&r1=926154&r2=926155&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Mon Mar 22 
16:19:12 2010
@@ -124,17 +124,17 @@ public class Crawl {
 injector.inject(crawlDb, rootUrlDir);
 int i;
 for (i = 0; i  depth; i++) { // generate new segment
-  Path segment = generator.generate(crawlDb, segments, -1, topN, System
+  Path[] segs = generator.generate(crawlDb, segments, -1, topN, System

svn commit: r926163 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

2010-03-22 Thread jnioche
Author: jnioche
Date: Mon Mar 22 16:29:30 2010
New Revision: 926163

URL: http://svn.apache.org/viewvc?rev=926163&view=rev
Log:
fixed NPE introduced in NUTCH-762

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=926163&r1=926162&r2=926163&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon Mar 
22 16:29:30 2010
@@ -480,7 +480,7 @@ public class Generator extends Configure
   LOG.info(Generator: topN:  + topN);
 }
 
-if (getConf().get(GENERATE_MAX_PER_HOST_BY_IP).equals(true)){
+if (true.equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))){
   LOG.info(Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use 
partition.url.mode instead);
 }
 




svn commit: r921831 - in /lucene/nutch/trunk: ./ lib/

2010-03-11 Thread jnioche
Author: jnioche
Date: Thu Mar 11 13:06:12 2010
New Revision: 921831

URL: http://svn.apache.org/viewvc?rev=921831&view=rev
Log:
NUTCH-798 : Upgrade to SOLR1.4 and its dependencies

Added:
lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar   (with props)
lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar   (with props)
lucene/nutch/trunk/lib/commons-httpclient-3.1.jar   (with props)
lucene/nutch/trunk/lib/commons-io-1.4.jar   (with props)
lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar   (with props)
lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar   (with props)
lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar   (with props)
lucene/nutch/trunk/lib/wstx-asl-3.2.7.jar   (with props)
Removed:
lucene/nutch/trunk/lib/apache-solr-common-1.3.0.jar
lucene/nutch/trunk/lib/apache-solr-solrj-1.3.0.jar
lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar
lucene/nutch/trunk/lib/slf4j-api-1.4.3.jar
Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=921831&r1=921830&r2=921831&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Mar 11 13:06:12 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-798 Upgrade to SOLR1.4 and its dependencies (jnioche)
+
 * NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche)
 
 * NUTCH-782 Ability to order htmlparsefilters (jnioche)

Added: lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/apache-solr-core-1.4.0.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/apache-solr-solrj-1.4.0.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/commons-httpclient-3.1.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-httpclient-3.1.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/commons-httpclient-3.1.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/commons-io-1.4.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-io-1.4.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/commons-io-1.4.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/geronimo-stax-api_1.0_spec-1.0.1.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/jcl-over-slf4j-1.5.5.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar?rev=921831&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/slf4j-api-1.5.5.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/wstx-asl-3.2.7.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/wstx-asl-3.2.7.jar?rev=921831&view=auto

svn commit: r921840 - in /lucene/nutch/trunk: CHANGES.txt conf/parse-plugins.xml src/plugin/build.xml src/plugin/parse-mp3/ src/plugin/parse-rtf/

2010-03-11 Thread jnioche
Author: jnioche
Date: Thu Mar 11 13:25:44 2010
New Revision: 921840

URL: http://svn.apache.org/viewvc?rev=921840&view=rev
Log:
NUTCH-801 Remove RTF and MP3 parse plugins

Removed:
lucene/nutch/trunk/src/plugin/parse-mp3/
lucene/nutch/trunk/src/plugin/parse-rtf/
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/parse-plugins.xml
lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=921840&r1=921839&r2=921840&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Mar 11 13:25:44 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-801 Remove RTF and MP3 parse plugins (jnioche)
+
 * NUTCH-798 Upgrade to SOLR1.4 and its dependencies (jnioche)
 
 * NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche)

Modified: lucene/nutch/trunk/conf/parse-plugins.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/parse-plugins.xml?rev=921840&r1=921839&r2=921840&view=diff
==
--- lucene/nutch/trunk/conf/parse-plugins.xml (original)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Thu Mar 11 13:25:44 2010
@@ -124,13 +124,11 @@
/mimeType
 
mimeType name=text/richtext
-   plugin id=parse-rtf /
-   plugin id=parse-msword /
+   plugin id=parse-tika /
/mimeType
 
mimeType name=text/rtf
-   plugin id=parse-rtf /
-   plugin id=parse-msword /
+   plugin id=parse-tika /
/mimeType
 
mimeType name=text/sgml
@@ -198,8 +196,6 @@
alias name=parse-html
extension-id=org.apache.nutch.parse.html.HtmlParser /
alias name=parse-js extension-id=JSParser /
-   alias name=parse-mp3
-   extension-id=org.apache.nutch.parse.mp3.MP3Parser /
alias name=parse-msexcel

extension-id=org.apache.nutch.parse.msexcel.MSExcelParser /
alias name=parse-mspowerpoint
@@ -212,10 +208,8 @@
extension-id=org.apache.nutch.parse.pdf.PdfParser /
alias name=parse-rss
extension-id=org.apache.nutch.parse.rss.RSSParser /
-alias name=feed
-extension-id=org.apache.nutch.parse.feed.FeedParser /
-   alias name=parse-rtf
-   
extension-id=org.apache.nutch.parse.rtf.RTFParseFactory /
+   alias name=feed
+   extension-id=org.apache.nutch.parse.feed.FeedParser /
alias name=parse-swf
extension-id=org.apache.nutch.parse.swf.SWFParser /
alias name=parse-text

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=921840&r1=921839&r2=921840&view=diff
==
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Thu Mar 11 13:25:44 2010
@@ -52,14 +52,12 @@
  ant dir=parse-ext target=deploy/
  ant dir=parse-html target=deploy/
  ant dir=parse-js target=deploy/
- !-- ant dir=parse-mp3 target=deploy/ --
  ant dir=parse-msexcel target=deploy/
  ant dir=parse-mspowerpoint target=deploy/
  ant dir=parse-msword target=deploy/
  ant dir=parse-oo target=deploy/
  ant dir=parse-pdf target=deploy/
  ant dir=parse-rss target=deploy/
- !-- ant dir=parse-rtf target=deploy/ --
  ant dir=parse-swf target=deploy/
  ant dir=parse-text target=deploy/
  ant dir=parse-tika target=deploy/




svn commit: r919358 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/solr/SolrIndexer.java src/java/org/apache/nutch/indexer/solr/SolrWriter.java

2010-03-05 Thread jnioche
Author: jnioche
Date: Fri Mar  5 10:09:08 2010
New Revision: 919358

URL: http://svn.apache.org/viewvc?rev=919358&view=rev
Log:
NUTCH-799 SOLRIndexer to commit once all reducers have finished

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=919358&r1=919357&r2=919358&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Mar  5 10:09:08 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-799 SOLRIndexer to commit once all reducers have finished (jnioche)
+
 * NUTCH-782 Ability to order htmlparsefilters (jnioche)
 
 * NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via 
jnioche) 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=919358&r1=919357&r2=919358&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java 
Fri Mar  5 10:09:08 2010
@@ -37,6 +37,8 @@
 import org.apache.nutch.indexer.NutchIndexWriterFactory;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
 
 public class SolrIndexer extends Configured implements Tool {
 
@@ -71,6 +73,12 @@
 FileOutputFormat.setOutputPath(job, tmp);
 try {
   JobClient.runJob(job);
+  // do the commits once and for all the reducers in one go
+  SolrServer solr =  new CommonsHttpSolrServer(solrUrl);
+  solr.commit();
+} 
+catch (Exception e){
+  LOG.error(e);
 } finally {
   FileSystem.get(job).delete(tmp, true);
 }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=919358&r1=919357&r2=919358&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java 
Fri Mar  5 10:09:08 2010
@@ -74,7 +74,7 @@
 solr.add(inputDocs);
 inputDocs.clear();
   }
-  solr.commit();
+  // solr.commit();
 } catch (final SolrServerException e) {
   throw makeIOException(e);
 }




svn commit: r917557 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/parse/HtmlParseFilters.java

2010-03-01 Thread jnioche
Author: jnioche
Date: Mon Mar  1 15:08:05 2010
New Revision: 917557

URL: http://svn.apache.org/viewvc?rev=917557&view=rev
Log:
NUTCH-782: Ability to order htmlparsefilters

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=917557&r1=917556&r2=917557&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar  1 15:08:05 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-782 Ability to order htmlparsefilters (jnioche)
+
 * NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via 
jnioche) 
 
 * NUTCH-790 Some external javadoc links are broken (siren)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=917557&r1=917556&r2=917557&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar  1 15:08:05 2010
@@ -996,6 +996,18 @@
   for most people would be img,script,link./description
 /property
 
+property
+  namehtmlparsefilter.order/name
+  value/value
+  descriptionThe order by which HTMLParse filters are applied.
+  If empty, all available HTMLParse filters (as dictated by properties
+  plugin-includes and plugin-excludes above) are loaded and applied in system
+  defined order. If not empty, only named filters are loaded and applied
+  in given order.
+  HTMLParse filter ordering MAY have an impact
+  on end result, as some filters could rely on the metadata generated by a 
previous filter.
+  /description
+/property
 
 !-- urlfilter plugin properties --
 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=917557&r1=917556&r2=917557&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java 
Mon Mar  1 15:08:05 2010
@@ -17,6 +17,7 @@
 
 package org.apache.nutch.parse;
 
+import java.util.ArrayList;
 import java.util.HashMap;
 
 import org.apache.nutch.protocol.Content;
@@ -30,12 +31,23 @@
 public class HtmlParseFilters {
 
   private HtmlParseFilter[] htmlParseFilters;
+  
+  public static final String HTMLPARSEFILTER_ORDER = htmlparsefilter.order;
 
   public HtmlParseFilters(Configuration conf) {
+String order = conf.get(HTMLPARSEFILTER_ORDER);
 ObjectCache objectCache = ObjectCache.get(conf);
 this.htmlParseFilters = (HtmlParseFilter[]) 
objectCache.getObject(HtmlParseFilter.class.getName());
 if (htmlParseFilters == null) {
-HashMapString, HtmlParseFilter filters =
+  /*
+   * If ordered filters are required, prepare array of filters based on
+   * property
+   */
+  String[] orderedFilters = null;
+  if (order != null  !order.trim().equals()) {
+orderedFilters = order.split(\\s+);
+  }
+HashMapString, HtmlParseFilter filterMap =
   new HashMapString, HtmlParseFilter();
 try {
 ExtensionPoint point = 
PluginRepository.get(conf).getExtensionPoint(HtmlParseFilter.X_POINT_ID);
@@ -45,12 +57,31 @@
 for (int i = 0; i  extensions.length; i++) {
 Extension extension = extensions[i];
 HtmlParseFilter parseFilter = (HtmlParseFilter) 
extension.getExtensionInstance();
-if 
(!filters.containsKey(parseFilter.getClass().getName())) {
-filters.put(parseFilter.getClass().getName(), 
parseFilter);
+if 
(!filterMap.containsKey(parseFilter.getClass().getName())) {
+filterMap.put(parseFilter.getClass().getName(), 
parseFilter);
 }
 }
-HtmlParseFilter[] htmlParseFilters = 
filters.values().toArray(new HtmlParseFilter[filters.size()]);
-objectCache.setObject(HtmlParseFilter.class.getName(), 
htmlParseFilters);
+HtmlParseFilter[] htmlParseFilters = 
filterMap.values().toArray(new HtmlParseFilter[filterMap.size()]);
+/*
+ * If no ordered filters required, just get the filters in an
+ * indeterminate order
+ */
+if (orderedFilters == null) {
+  objectCache.setObject(HtmlParseFilter.class.getName(), 
htmlParseFilters

svn commit: r910454 - in /lucene/nutch/trunk/src/plugin/languageidentifier/src: java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.j

2010-02-16 Thread jnioche
Author: jnioche
Date: Tue Feb 16 10:20:22 2010
New Revision: 910454

URL: http://svn.apache.org/viewvc?rev=910454&view=rev
Log:
NUTCH-794 : Language Identification must use check the parse metadata for 
language values

Modified:

lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java

lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=910454&r1=910453&r2=910454&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
 Tue Feb 16 10:20:22 2010
@@ -91,15 +91,33 @@
 
 Parse parse = parseResult.get(content.getUrl());
 
+String lang = getLanguageFromMetadata(parse.getData().getParseMeta());
+if (lang != null) {
+  parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
+  return parseResult;
+}
+
 // Trying to find the document's language
 LanguageParser parser = new LanguageParser(doc);
-String lang = parser.getLanguage();
+lang = parser.getLanguage();
 
 if (lang != null) {
   parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
 }
 return parseResult;
   }
+  
+  // Check in the metadata whether the language has already been stored there 
by Tika
+  private static String getLanguageFromMetadata(Metadata parseMD){
+// dublin core 
+String lang = parseMD.get(dc.language);
+if (lang!=null) return lang;
+// meta content-language
+lang = parseMD.get(content-language);
+if (lang!=null) return lang;
+// lang attribute
+return parseMD.get(lang);
+  }
 
   static class LanguageParser {
 

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=910454&r1=910453&r2=910454&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
 Tue Feb 16 10:20:22 2010
@@ -40,7 +40,8 @@
   htmlheadmeta http-equiv=\content-language\ 
content=\en\titledocument 2 title/headbodythis is 
english/body/html,
   htmlheadmeta name=\dc.language\ content=\en\titledocument 3 
title/headbodythis is english/body/html };
 
-  String metalanguages[] = { fi, en, en };
+  // NUTCH-794 : temporarily replaced fi and en with null
+  String metalanguages[] = { null, en, en };
 
   /**
* Test parsing of language identifiers from html 




svn commit: r906907 - in /lucene/nutch/trunk: CHANGES.txt conf/domain-suffixes.xml

2010-02-05 Thread jnioche
Author: jnioche
Date: Fri Feb  5 11:52:57 2010
New Revision: 906907

URL: http://svn.apache.org/viewvc?rev=906907&view=rev
Log:
NUTCH-786

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/domain-suffixes.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=906907&r1=906906&r2=906907&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Feb  5 11:52:57 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-786 Improvement to the list of suffix domains (jnioche)
+
 * NUTCH-775 Enhance searcher interface (siren)
 
 * NUTCH-781 Update Tika to v0.6 (jnioche)

Modified: lucene/nutch/trunk/conf/domain-suffixes.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/domain-suffixes.xml?rev=906907&r1=906906&r2=906907&view=diff
==
--- lucene/nutch/trunk/conf/domain-suffixes.xml (original)
+++ lucene/nutch/trunk/conf/domain-suffixes.xml Fri Feb  5 11:52:57 2010
@@ -1744,6 +1744,16 @@
 suffix domain=retina.ar /
 suffix domain=uba.ar /
 
+suffix domain=com.ar /
+suffix domain=edu.ar /
+suffix domain=gob.ar /
+suffix domain=gov.ar /
+suffix domain=int.ar /
+suffix domain=mil.ar /
+suffix domain=net.ar /
+suffix domain=org.ar /
+suffix domain=tur.ar /
+
 !--  arpa : http://en.wikipedia.org/wiki/.arpa--
 suffix domain=e164.arpa /
 suffix domain=in-addr.arpa /
@@ -1955,6 +1965,14 @@
 
 !--  co : http://en.wikipedia.org/wiki/.co--
 
+suffix domain=com.co /
+suffix domain=org.co /
+suffix domain=edu.co /
+suffix domain=gov.co /
+suffix domain=net.co /
+suffix domain=mil.co /
+suffix domain=nom.co /
+
 !--  com : http://en.wikipedia.org/wiki/.com--
 
 !--  coop : http://en.wikipedia.org/wiki/.coop--
@@ -2215,9 +2233,26 @@
 
 !--  id : http://en.wikipedia.org/wiki/.id--
 
+suffix domain=ac.id /
+suffix domain=co.id /
+suffix domain=net.id /
+suffix domain=or.id /
+suffix domain=web.id /
+suffix domain=sch.id /
+suffix domain=mil.id /
+suffix domain=go.id /
+
 !--  ie : http://en.wikipedia.org/wiki/.ie--
 
 !--  il : http://en.wikipedia.org/wiki/.il--
+   suffix domain=ac.il /
+   suffix domain=co.il /
+   suffix domain=org.il /
+   suffix domain=net.il /
+   suffix domain=k12.il /
+   suffix domain=gov.il /
+   suffix domain=muni.il /
+   suffix domain=idf.il /
 
 !--  im : https://www.nic.im/pdfs/imfaqs.pdf--
 suffix domain=co.im /
@@ -2854,6 +2889,11 @@
 suffix domain=org.mw /
 
 !--  mx : http://www.nic.mx/--
+suffix domain=com.mx /
+suffix domain=edu.mx /
+suffix domain=gob.mx /
+suffix domain=net.mx /
+suffix domain=org.mx /
 
 !--  my : http://www.mynic.net.my/--
 
@@ -3661,6 +3701,19 @@
 !--  nu : http://en.wikipedia.org/wiki/.nu--
 
 !--  nz : http://en.wikipedia.org/wiki/.nz--
+   suffix domain=ac.nz /
+   suffix domain=co.nz /
+   suffix domain=cri.nz /
+   suffix domain=geek.nz /
+   suffix domain=gen.nz /
+   suffix domain=govt.nz /
+   suffix domain=iwi.nz /
+   suffix domain=maori.nz /
+   suffix domain=mil.nz /
+   suffix domain=net.nz /
+   suffix domain=org.nz /
+   suffix domain=parliament.nz /
+   suffix domain=school.nz /
 
 !--  om : http://en.wikipedia.org/wiki/.om--
 
@@ -4344,7 +4397,28 @@
 
 !--  yu : http://www.nic.yu/pravilnik-e.html--
 
-!--  za : http://www.zadna.org.za/slds.html--
+!--  za : http://www.zadna.org.za/slds.html
+   http://en.wikipedia.org/wiki/.za
+ --
+suffix domain=ac.za /
+suffix domain=city.za /
+suffix domain=co.za /
+suffix domain=edu.za /
+suffix domain=gov.za /
+suffix domain=law.za /
+suffix domain=mil.za /
+suffix domain=nom.za /
+suffix domain=org.za /
+suffix domain=school.za /
+suffix domain=ecape.school.za /
+suffix domain=fs.school.za /
+suffix domain=gp.school.za /
+suffix domain=kzn.school.za /
+suffix domain=mpm.school.za /
+suffix domain=ncape.school.za /
+suffix domain=lp.school.za /
+suffix domain=nw.school.za /
+suffix domain=wcape.school.za /
 
 !--  zm : http://en.wikipedia.org/wiki/.zm--
 




svn commit: r905550 [1/2] - /lucene/nutch/trunk/conf/tika-mimetypes.xml

2010-02-02 Thread jnioche
Author: jnioche
Date: Tue Feb  2 09:31:19 2010
New Revision: 905550

URL: http://svn.apache.org/viewvc?rev=905550&view=rev
Log:
NUTCH-781 : updated tika-mimetypes.xml

Modified:
lucene/nutch/trunk/conf/tika-mimetypes.xml



svn commit: r905228 - in /lucene/nutch/trunk/lib: tika-core-0.5.jar tika-core-0.6.jar

2010-02-01 Thread jnioche
Author: jnioche
Date: Mon Feb  1 09:59:50 2010
New Revision: 905228

URL: http://svn.apache.org/viewvc?rev=905228&view=rev
Log:
NUTCH-781: upgrade tika to version 0.6

Added:
lucene/nutch/trunk/lib/tika-core-0.6.jar   (with props)
Removed:
lucene/nutch/trunk/lib/tika-core-0.5.jar

Added: lucene/nutch/trunk/lib/tika-core-0.6.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-core-0.6.jar?rev=905228&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/tika-core-0.6.jar
--
svn:mime-type = application/octet-stream




svn commit: r905229 - /lucene/nutch/trunk/CHANGES.txt

2010-02-01 Thread jnioche
Author: jnioche
Date: Mon Feb  1 10:03:07 2010
New Revision: 905229

URL: http://svn.apache.org/viewvc?rev=905229&view=rev
Log:
NUTCH-781: upgrade tika to version 0.6

Modified:
lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=905229&r1=905228&r2=905229&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Feb  1 10:03:07 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-781 Update Tika to v0.6 (jnioche)
+
 * NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count 
(stack + jnioche)
 
 * NUTCH-655 Injecting Crawl metadata (jnioche)




svn commit: r897825 - in /lucene/nutch/trunk/src: java/org/apache/nutch/util/MimeUtil.java test/org/apache/nutch/protocol/TestContent.java

2010-01-11 Thread jnioche
Author: jnioche
Date: Mon Jan 11 10:13:21 2010
New Revision: 897825

URL: http://svn.apache.org/viewvc?rev=897825&view=rev
Log:
fix for NUTCH-767 : reverted original expected values for test + treat 
text/plain as a default mime-type from Tika

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=897825&r1=897824&r2=897825&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Mon Jan 11 
10:13:21 2010
@@ -159,6 +159,7 @@
 if (this.mimeMagic) {
   MimeType magicType = this.mimeTypes.getMimeType(data);
   if (magicType != null  
!magicType.getName().equals(MimeTypes.OCTET_STREAM)
+   !magicType.getName().equals(MimeTypes.PLAIN_TEXT)
type != null  !type.getName().equals(magicType.getName())) {
 // If magic enabled and the current mime type differs from that of the
 // one returned from the magic, take the magic mimeType

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?rev=897825&r1=897824&r2=897825&view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Mon 
Jan 11 10:13:21 2010
@@ -63,28 +63,19 @@
 http://www.foo.com/;,
 .getBytes(UTF8),
 text/html; charset=UTF-8, p, conf);
-// TODO check potential Tika issue and 
-// revert the expected value to text/html
-// see https://issues.apache.org/jira/browse/NUTCH-767
-assertEquals(text/plain, c.getContentType());
+assertEquals(text/html, c.getContentType());
 
 c = new Content(http://www.foo.com/foo.html;,
 http://www.foo.com/;,
 .getBytes(UTF8),
 , p, conf);
-// TODO check potential Tika issue and 
-// revert the expected value to text/html
-// see https://issues.apache.org/jira/browse/NUTCH-767
-assertEquals(text/plain, c.getContentType());
+assertEquals(text/html, c.getContentType());
 
 c = new Content(http://www.foo.com/foo.html;,
 http://www.foo.com/;,
 .getBytes(UTF8),
 null, p, conf);
-// TODO check potential Tika issue and 
-// revert the expected value to text/html
-// see https://issues.apache.org/jira/browse/NUTCH-767
-assertEquals(text/plain, c.getContentType());
+assertEquals(text/html, c.getContentType());
 
 c = new Content(http://www.foo.com/;,
 http://www.foo.com/;,
@@ -108,10 +99,7 @@
 http://www.foo.com/;,
 .getBytes(UTF8),
 , p, conf);
-// TODO check that Tika returns the right value and
-// revert to the default type
-// see https://issues.apache.org/jira/browse/NUTCH-767
-assertEquals(text/plain, c.getContentType());
+assertEquals(MimeTypes.OCTET_STREAM, c.getContentType());
 
 c = new Content(http://www.foo.com/;,
 http://www.foo.com/;,




svn commit: r897180 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/crawl/CrawlDbReducer.java

2010-01-08 Thread jnioche
Author: jnioche
Date: Fri Jan  8 12:01:46 2010
New Revision: 897180

URL: http://svn.apache.org/viewvc?rev=897180&view=rev
Log:
NUTCH-269 : OOME because no upper-bound on inlinks count (stack + jnioche)

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=897180&r1=897179&r2=897180&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Jan  8 12:01:46 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count 
(stack + jnioche)
+
 * NUTCH-655 Injecting Crawl metadata (jnioche)
 
 * NUTCH-658 Use counters to report fetching and parsing status (jnioche)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=897180&r1=897179&r2=897180&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Fri Jan  8 12:01:46 2010
@@ -384,6 +384,14 @@
 /property
 
 property
+  namedb.update.max.inlinks/name
+  value1/value
+  descriptionMaximum number of inlinks to take into account when updating 
+  a URL score in the crawlDB. Only the best scoring inlinks are kept. 
+  /description
+/property
+
+property
   namedb.ignore.internal.links/name
   valuetrue/value
   descriptionIf true, when adding new links to a page, links from

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=897180&r1=897179&r2=897180&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Fri 
Jan  8 12:01:46 2010
@@ -19,6 +19,7 @@
 
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 import java.io.IOException;
 
 // Commons Logging imports
@@ -27,6 +28,7 @@
 
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.PriorityQueue;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
@@ -37,7 +39,7 @@
   
   private int retryMax;
   private CrawlDatum result = new CrawlDatum();
-  private ArrayListCrawlDatum linked = new ArrayListCrawlDatum();
+  private InlinkPriorityQueue linked = null;
   private ScoringFilters scfilters = null;
   private boolean additionsAllowed;
   private int maxInterval;
@@ -51,6 +53,8 @@
 maxInterval = job.getInt(db.fetch.interval.max, 0 );
 if (oldMaxInterval  0  maxInterval == 0) maxInterval = oldMaxInterval * 
FetchSchedule.SECONDS_PER_DAY;
 schedule = FetchScheduleFactory.getFetchSchedule(job);
+int maxLinks = job.getInt(db.update.max.inlinks, 1);
+linked = new InlinkPriorityQueue(maxLinks);
   }
 
   public void close() {}
@@ -111,7 +115,7 @@
 } else {
   link = datum;
 }
-linked.add(link);
+linked.insert(link);
 break;
   case CrawlDatum.STATUS_SIGNATURE:
 signature = datum.getSignature();
@@ -120,13 +124,21 @@
 LOG.warn(Unknown status, key:  + key + , datum:  + datum);
   }
 }
-
+
+// copy the content of the queue into a List
+// in reversed order
+int numLinks = linked.size();
+ListCrawlDatum linkList = new ArrayListCrawlDatum(numLinks);
+for (int i = numLinks - 1; i = 0; i--) {
+  linkList.add(linked.pop());
+}
+
 // if it doesn't already exist, skip it
 if (!oldSet  !additionsAllowed) return;
 
 // if there is no fetched datum, perhaps there is a link
-if (!fetchSet  linked.size()  0) {
-  fetch = linked.get(0);
+if (!fetchSet  linkList.size()  0) {
+  fetch = linkList.get(0);
   fetchSet = true;
 }
 
@@ -260,7 +272,7 @@
 }
 
 try {
-  scfilters.updateDbScore((Text)key, oldSet ? old : null, result, linked);
+  scfilters.updateDbScore((Text)key, oldSet ? old : null, result, 
linkList);
 } catch (Exception e) {
   if (LOG.isWarnEnabled()) {
 LOG.warn(Couldn't update score, key= + key + :  + e);
@@ -270,5 +282,20 @@
 result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
 output.collect(key, result);
   }
+  
+}
 
+class InlinkPriorityQueue extends PriorityQueueCrawlDatum {
+  
+  public InlinkPriorityQueue(int maxSize) {
+initialize(maxSize);
+  }
+  
+  /** Determines the ordering of objects

svn commit: r896539 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Injector.java

2010-01-06 Thread jnioche
Author: jnioche
Date: Wed Jan  6 17:01:51 2010
New Revision: 896539

URL: http://svn.apache.org/viewvc?rev=896539&view=rev
Log:
NUTCH-655 : Injecting Crawl metadata (jnioche)

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=896539&r1=896538&r2=896539&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jan  6 17:01:51 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-655 Injecting Crawl metadata (jnioche)
+
 * NUTCH-658 Use counters to report fetching and parsing status (jnioche)
 
 * NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=896539&r1=896538&r2=896539&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Jan  6 
17:01:51 2010
@@ -37,10 +37,21 @@
 import org.apache.nutch.util.NutchJob;
 
 /** This class takes a flat file of URLs and adds them to the of pages to be
- * crawled.  Useful for bootstrapping the system. */
+ * crawled.  Useful for bootstrapping the system. 
+ * The URL files contain one URL per line, optionally followed by custom 
metadata 
+ * separated by tabs with the metadata key separated from the corresponding 
value by '='. br
+ * Note that some metadata keys are reserved : br
+ * - inutch.score/i : allows to set a custom score for a specific URL br
+ * - inutch.fetchInterval/i : allows to set a custom fetch interval for a 
specific URL br
+ * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 
\t userType=open_source
+ **/
 public class Injector extends Configured implements Tool {
   public static final Log LOG = LogFactory.getLog(Injector.class);
-
+  
+  /** metadata key reserved for setting a custom score for a specific URL */
+  public static String nutchScoreMDName = nutch.score;
+  /** metadata key reserved for setting a custom fetchInterval for a specific 
URL */
+  public static String nutchFetchIntervalMDName = nutch.fetchInterval;
 
   /** Normalize and filter injected urls. */
   public static class InjectMapper implements MapperWritableComparable, Text, 
Text, CrawlDatum {
@@ -68,6 +79,36 @@
 OutputCollectorText, CrawlDatum output, Reporter 
reporter)
   throws IOException {
   String url = value.toString();  // value is line of text
+  // if tabs : metadata that could be stored
+  // must be name=value and separated by \t
+  float customScore = -1f;
+  int customInterval = interval;
+  MapString,String metadata = new TreeMapString,String();
+  if (url.indexOf(\t)!=-1){
+ String[] splits = url.split(\t);
+ url = splits[0];
+ for (int s=1;ssplits.length;s++){
+ // find separation between name and value
+ int indexEquals = splits[s].indexOf(=);
+ if (indexEquals==-1) {
+ // skip anything without a =
+ continue; 
+ }
+ String metaname = splits[s].substring(0, indexEquals);
+ String metavalue = splits[s].substring(indexEquals+1);
+ if (metaname.equals(nutchScoreMDName)) {
+ try {
+ customScore = Float.parseFloat(metavalue);}
+ catch (NumberFormatException nfe){}
+ }
+ else if (metaname.equals(nutchFetchIntervalMDName)) {
+ try {
+ customInterval = Integer.parseInt(metavalue);}
+ catch (NumberFormatException nfe){}
+ }
+ else metadata.put(metaname,metavalue);
+ }
+  }
   try {
 url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
 url = filters.filter(url); // filter the url
@@ -77,17 +118,27 @@
   }
   if (url != null) {  // if it passes
 value.set(url);   // collect it
-CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, 
interval);
+CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, 
customInterval);
 datum.setFetchTime(curTime);
-datum.setScore(scoreInjected);
-try {
-  scfilters.injectedScore(value, datum);
-} catch (ScoringFilterException e) {
-  if (LOG.isWarnEnabled()) {
-LOG.warn(Cannot filter

svn commit: r895972 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java src/java/org/apache/nutch/parse/ParseSegment.java src/java/org/apache/nutch/protocol/ProtocolSt

2010-01-05 Thread jnioche
Author: jnioche
Date: Tue Jan  5 10:14:49 2010
New Revision: 895972

URL: http://svn.apache.org/viewvc?rev=895972&view=rev
Log:
NUTCH-658 : Add Counter for # of doc fetched in Reporter

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=895972&r1=895971&r2=895972&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Jan  5 10:14:49 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-658 Use counters to report fetching and parsing status (jnioche)
+
 * NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann)
 
 * NUTCH-767 Update Tika to v0.5 for the MimeType detection (Julien Nioche via 
ab)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=895972&r1=895971&r2=895972&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Jan  
5 10:14:49 2010
@@ -607,6 +607,7 @@
   LOG.debug(Denied by robots.txt:  + fit.url);
 }
 output(fit.url, fit.datum, null, 
ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
+reporter.incrCounter(FetcherStatus, robots_denied, 1);
 continue;
   }
   if (rules.getCrawlDelay()  0) {
@@ -615,6 +616,7 @@
   fetchQueues.finishFetchItem(fit, true);
   LOG.debug(Crawl-Delay for  + fit.url +  too long ( + 
rules.getCrawlDelay() + ), skipping);
   output(fit.url, fit.datum, null, 
ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
+  reporter.incrCounter(FetcherStatus, 
robots_denied_maxcrawldelay, 1);
   continue;
 } else {
   FetchItemQueue fiq = 
fetchQueues.getFetchItemQueue(fit.queueID);
@@ -630,6 +632,8 @@
 
   String urlString = fit.url.toString();
 
+  reporter.incrCounter(FetcherStatus, status.getName(), 1);
+  
   switch(status.getCode()) {
 
   case ProtocolStatus.WOULDBLOCK:
@@ -664,6 +668,7 @@
 } else {
   // stop redirecting
   redirecting = false;
+  reporter.incrCounter(FetcherStatus, 
FetchItem.notCreated.redirect, 1);
 }
   }
 }
@@ -701,6 +706,7 @@
   } else {
 // stop redirecting
 redirecting = false;
+reporter.incrCounter(FetcherStatus, 
FetchItem.notCreated.redirect, 1);
   }
 } else {
   // stop redirecting
@@ -926,6 +932,7 @@
   if (parseResult != null  !parseResult.isEmpty()) {
 Parse p = parseResult.get(content.getUrl());
 if (p != null) {
+  reporter.incrCounter(ParserStatus, 
ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
   return p.getData().getStatus();
 }
   }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=895972&r1=895971&r2=895972&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue 
Jan  5 10:14:49 2010
@@ -93,6 +93,8 @@
   Parse parse = entry.getValue();
   ParseStatus parseStatus = parse.getData().getStatus();
   
+  reporter.incrCounter(ParserStatus, 
ParseStatus.majorCodes[parseStatus.getMajorCode()], 1);
+  
   if (!parseStatus.isSuccess()) {
 LOG.warn(Error parsing:  + key + :  + parseStatus);
 parse = parseStatus.getEmptyParse(getConf());

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=895972&r1=895971&r2=895972&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol

svn commit: r894716 - in /lucene/nutch/trunk: site/credits.html site/credits.pdf src/site/src/documentation/content/xdocs/credits.xml

2009-12-30 Thread jnioche
Author: jnioche
Date: Wed Dec 30 21:34:28 2009
New Revision: 894716

URL: http://svn.apache.org/viewvc?rev=894716&view=rev
Log:
Adding J. Nioche to the list of committers

Modified:
lucene/nutch/trunk/site/credits.html
lucene/nutch/trunk/site/credits.pdf
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/credits.xml

Modified: lucene/nutch/trunk/site/credits.html
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.html?rev=894716&r1=894715&r2=894716&view=diff
==
--- lucene/nutch/trunk/site/credits.html (original)
+++ lucene/nutch/trunk/site/credits.html Wed Dec 30 21:34:28 2009
@@ -252,6 +252,10 @@
 /li
   
 li
+a href=http://www.digitalpebble.com/;Julien Nioche/a
+/li
+  
+li
 a href=http://people.apache.org/~siren;Sami Siren/a
 /li
   
@@ -261,7 +265,7 @@
 /div
 
 
-a name=N10042/aa name=Friends/a
+a name=N10047/aa name=Friends/a
 h2 class=h3Friends/h2
 div class=section
 ul
@@ -292,7 +296,7 @@
 /div
 
 
-a name=N1008C/aa name=Sponsors/a
+a name=N10091/aa name=Sponsors/a
 h2 class=h3Sponsors/h2
 div class=section
 ul

Modified: lucene/nutch/trunk/site/credits.pdf
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/site/credits.pdf?rev=894716&r1=894715&r2=894716&view=diff
==
--- lucene/nutch/trunk/site/credits.pdf (original)
+++ lucene/nutch/trunk/site/credits.pdf Wed Dec 30 21:34:28 2009
@@ -58,10 +58,10 @@
 
 endobj
 14 0 obj
- /Length 2451 /Filter [ /ASCII85Decode /FlateDecode ]
+ /Length 2523 /Filter [ /ASCII85Decode /FlateDecode ]
  
 stream
-Gat=-?#uMo'RekG6)QN,j4X4IDDVKe8Cd99l3ZjVX=Q[b.BLBhuXiFrUmi*,VH1...@__ei(`FER#:35J-3KkLmIc0$E/-9at+C5'JL_g:M-TJq:j=p'n#r(rJ*A@`ZXIRrcMN2^?jo+TV(?8=o8Z43rXNS`.lKsK^`(anX=FV;m0$Sh;[*WTGKJTabq4PBnG%HT1]HqfD`^fK]+DMGC][A;AauDhV\=BF-6%+--+,-R^Q`J-rq1^/aI!E7A77`*g/j,2T[+;*_3p]F4O:C]]NLjH]*Wa3p$E[tVeq1,Q9`XAoU`^21U3M3(8Y,Y8+d:rLd/?ErprgkLDtPtOH9;iBRt,/sV1!^kLNh#W?Xc;Z=^0N*TO_9#QGEfj)K%-/N_]VP#g)tHj?tSb1CVc1X8#u7Ig^0aenmP)fY)!'l;.LN!;sR5V8_qKq?BssHf[1`%[p=)^/[j['-n/*i...@j8c=+o1v+oasfe;/X[k+:G!Fa`7,=6%UB(je=+0Jk]pJuk0$Fulw0jqci.wtfp?...@jip'*(4IN(nFWUC'4$+ASc0$'Hb0).HOX.nYPOI'ZfjA4lGoAq.OB7pr*@;,dS]W^Y..TVojXN,X0'^8csme]rov...@mrm=7Ume#n129mnp^[\n!AiOV7^V6PJ*$4n5)Ta(oU;!=fZ7KED][iPuL2sG4B2QRR;R8h]pal\kC5TJ`i!B1dgG;M/ZcccB^2R[RqCf=sJoXH_T-PK\n,\QIE;N:-%H:i6IS9GRTL\a#TIr0[=g'YE]#JSS-7VY96Wr.ok]i=r\uB18i+V7Ss'SGHVU[Qb5/Q5TWMK3h(]eqes3?1^...@ad(So.oH1',$NU^Vd0*s7V-7,/:Z[3MHCgmC(j-Qr=m,Sk0?Vek)h6``t6vuk#7h5gqp...@p.[pq#d3goh`ulzV]:Xb:Hp^fcB(!R
 
F73]PiP4t\GcOq?o]khE6Y22'f]RU%Dng'BBRjXCeQL1A4N6Sf...@aa$iphpn0eW`2/dlc)_DpfE4...@s(+[R;P?Rin#F'!O_VKp/mC^:W\HgRJ*)iCHjY0+n`Fo^;UV,l...@r/P-0YPe`DIbReP7Ok^:5L1ee7)3\(au1D0)OB\]4nLeh+trE:]o[ep'HMp0S6f*#u`m_=1)$r+/?0fr%8)ZF4M9-8$qiO8pl$RTRX;'t5i!`R=2hX%*)iE^Beb'hN'B_]]Q9K$mCB;OKPjEB#qBR8FHImcT#TBgL.Y1X%WTY=K?+bCYbdm?3_Q,4o=AA(i]nk\S*G)?IJnPSJACaqk81+WAU$53q%...@1-fj`h8#K`Kbreb;?jHe0.q[!7dAKB7am6Gc+o6UEhb-kk2j:)X;T.C0o[$Osp`HGL*)*aqOa!...@u:!nGBQoda#s8BMf2l]$XQn0D%X)b'213CgRMXW+^tAD'KIJ5,Sk/mc7!*...@$xk*%*p=z.(r-Vcj#EOrcOdWY'T@sLig2Y'taK-,a-qM[rFHonKAtmbo1ejl28\D7UmR'edeKrfRJNN6G'FB5XBB=$3rGK9B(0DX,]nr`d...@`-q)pJ?'%5m-cho3$21...@l!4?0yck3^lHrB4+UOCPg#tMPmHhVcMDK0f-5e+uxw]j#...@qs-%sbew.jf15ga\^`,1S+C=C-df;1RW])_L(N8X,+O2I;PjoBEdqj[4Bb`o8s'nn'=NZ0ZN-f]T05Hk/tG(2[H)qDrNDb0!qqU^G'Zp4:7+dW)^0sm#,\V;7V0cjr1?65,n5neQ]uX,I?i]`03tf*jceD7Ai2le5*K]0;mM5YCq`!ofr]a8\.SR$.PQBW:iuM@:=...@o5j8t`m*m[rha9?iqa7'0q\7-8L)l5hKBnL'aVDO%.'DiEo(PG2^...@]i$qlml01\/
 
2B4k5s50%$QKEcg-JXBIlL(H]XQic*j4!D2]JL2)q4`D](P7bS1u/um*'dp)f%/qYmj-Z?$AUlBL1%XlNB'tb/EI[Hn+K8[3Y8$Ind)?+Q^H,QmfH2%'N%_oPR?e$KQgk$dJLFe)cW2K6.Q%P8Zo1kRAhX59n?j...@mez0l[inp$=f$knplk[)_Je...@\!bxq#rkbmc5ss!-'*2##SYi$'OlZ.b%8Sra-l^V8:$KYrhDcQEg/WH3R`D=psS7?oce?FP[7Pe_$g8ck,p+RDZPcD%]pJJI,k...@j3h?p5n:o%3krIkL#pRsD#!?h]F0FGL\q2oJ5:7u3j.f.!$/5=L,M8k5bmiXYO9tD%($\s6U-#QdabpH@gbM8BONu7[aH:N\^fLQb6,C?YISJ#LRlDar#mfYU4Fk*_:IRVLV?eD)?r#-0;hW[7M/pHR5ZLdq~
+Gat=.gNfR:NH+I/skrF!LtiR*u_M./K*UroO.4X$)QZs;\?nFHIT,qam^^^`%^vg*j...@ah*$digos4s'*4h9'\kgPU0t;o5m7...@1eb.1$uf_9g1=lniw#-T^ZjK7)qgDMsZE.@:Ig7nKWul4[s+7AY5@:h,KSD5E!GsAo+\kBCPVTaQrORMGYs!1bM23u/!\(HFEC#eiN5U\Qq3s-...@fq1r(m%Ktb[WS\4ue)+/C6=IN_'o$_khc92e...@$[$81.ha[n4eqh1jn@Wat_:h8CACsU_H6qf==ak]S$1Q#_\,o.Cdrh[?V9EYAl2ZQojP:a...@apgbsf34l6d%25@$;eS!,MWHdkHWlp-u0hZ[1MHe$:Oq,mjh[08gu1...@?no=hc0'?1E4O_25*I!Cu'o.d0]$I4o`?b6r5R[VqJHtTH0J\W](IW#f3,huj]v]t5mn...@\j:0YBkS%m8=K5g;C0^XQEJU[E1Pf:kk-Zq_e+Mf4PFQ`h2Y188-k]p#g(@,5VfseCbE?F:`3(CDk$A_c#$'6^EnKoo'=DVP[cj(1`Z!/GLqMlI'_PZ=j'(/nk...@fmq3\jo#r1]S]l-i,?Ph-``Gi-s;WK:i$e4Z``]21SYbbc$nE5rp]=[@(H\rE*[qO7#Ynt4#%'?4'qE87L)qbI0Jpm,!pItf#5'$l$ec7b_j,l[#2co1DBg(Z_cUknM!=eoouHLA/R09=REWfcsp#HaX$g%+;k.b)A5`W!ateTpHt/+0leNc/VuN[:mh^84EM8?!]/Z6e'(ch95enOV7h.L'?p:(esRj9'XYQ4`BF1r1#H(tG6F0S8$epBAbhb6Oiboc`RZIEJ%0XX*Wc+Z'O$.q\)u=Ws3F:cg9n]iv[...@+4udv.e-b:N7`0^BeYXnBmjdfWIcI0p^]7XrK7f)7o8Cb9DNcU*5f)Y$
 
fZ/@DdNgr_D,g...@iup?phkyk5$kx!m0_fthr=hu