svn commit: r745416 - /lucene/nutch/trunk/build.xml

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 08:11:46 2009
New Revision: 745416

URL: http://svn.apache.org/viewvc?rev=745416&view=rev
Log:
NUTCH-687 add RAT

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=745416&r1=745415&r2=745416&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Wed Feb 18 08:11:46 2009
@@ -610,4 +610,23 @@
 
   
 
+  
+  
+  
+  
+
+  
+
+  
+
+  
+
+  
+
+  
+  
+
+  
+   
 




svn commit: r745446 - in /lucene/nutch/trunk/src: java/org/apache/nutch/util/ plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/ plugin/field-boost/src/java/org/apache/nutch/indexer/fie

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 09:14:29 2009
New Revision: 745446

URL: http://svn.apache.org/viewvc?rev=745446&view=rev
Log:
NUTCH-688 add missing headers, part 2 rest

Modified:

lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java

lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java

lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java

lucene/nutch/trunk/src/plugin/response-json/src/java/org/apache/nutch/searcher/response/json/JSONResponseWriter.java

lucene/nutch/trunk/src/plugin/response-xml/src/java/org/apache/nutch/searcher/response/xml/XMLResponseWriter.java

lucene/nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java

lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java

lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java?rev=745446&r1=745445&r2=745446&view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
 (original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
 Wed Feb 18 09:14:29 2009
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.nutch.util;
 
 import java.io.DataInput;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=745446&r1=745445&r2=745446&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Wed Feb 
18 09:14:29 2009
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.nutch.util;
 
 import java.util.Stack;

Modified: 
lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java?rev=745446&r1=745445&r2=745446&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
 Wed Feb 18 09:14:29 2009
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except i

svn commit: r745448 - /lucene/nutch/trunk/build.xml

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 09:18:07 2009
New Revision: 745448

URL: http://svn.apache.org/viewvc?rev=745448&view=rev
Log:
NUTCH-687 add RAT, also check plugins

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=745448&r1=745447&r2=745448&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Wed Feb 18 09:18:07 2009
@@ -624,7 +624,9 @@
   
 
-  
+  
+   
+   
   
 
   




svn commit: r745499 - in /lucene/nutch/trunk: ./ src/plugin/lib-jakarta-poi/ src/plugin/lib-jakarta-poi/lib/ src/plugin/parse-msword/ src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ sr

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 12:43:04 2009
New Revision: 745499

URL: http://svn.apache.org/viewvc?rev=745499&view=rev
Log:
NUTCH-691 - Update jakarta poi jars to the most relevant version, contributed 
by Dmitry Lihachev

Added:

lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar   
(with props)

lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar
   (with props)
Removed:

lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar

lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml
lucene/nutch/trunk/src/plugin/parse-msword/build.xml

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java

lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java

lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745499&r1=745498&r2=745499&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:43:04 2009
@@ -343,6 +343,9 @@
 128. NUTCH-631 - MoreIndexingFilter fails with NoSuchElementException
  (Stefan Will, siren)
  
+129. NUTCH-691 - Update jakarta poi jars to the most relevant version
+ (Dmitry Lihachev via siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j configuration to log to stdout on commandline

Added: 
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar?rev=745499&view=auto
==
Binary file - no diff available.

Propchange: 
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar
--
svn:mime-type = application/octet-stream

Added: 
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar?rev=745499&view=auto
==
Binary file - no diff available.

Propchange: 
lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar
--
svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml?rev=745499&r1=745498&r2=745499&view=diff
==
--- lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml Wed Feb 18 
12:43:04 2009
@@ -29,10 +29,10 @@
provider-name="jakarta.apache.org">
 

- 
+ 
 
  
- 
+ 
 
  


Modified: lucene/nutch/trunk/src/plugin/parse-msword/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/build.xml?rev=745499&r1=745498&r2=745499&view=diff
==
--- lucene/nutch/trunk/src/plugin/parse-msword/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/build.xml Wed Feb 18 12:43:04 
2009
@@ -44,7 +44,8 @@
 
   
   
-  
-  
+  
+
+  
 
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java?rev=745499&r1=745498&r2=745499&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java
 Wed Feb 18 12:43:04 2009
@@ -53,8 +53,9 @@
 int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
 
 // get a list of character properties
+
 Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, 
chpTableOffset,
-  chpTableSize, fcMin);
+  chpTableSize, fcMin, new TextPieceTable());
 List textRuns = chp

svn commit: r745503 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 12:53:12 2009
New Revision: 745503

URL: http://svn.apache.org/viewvc?rev=745503&view=rev
Log:
NUTCH-563 Include custom fields in BasicQueryFilter, contributed by Julien 
Nioche

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml

lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745503&r1=745502&r2=745503&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:53:12 2009
@@ -346,6 +346,9 @@
 129. NUTCH-691 - Update jakarta poi jars to the most relevant version
  (Dmitry Lihachev via siren)
 
+130. NUTCH-563 - Include custom fields in BasicQueryFilter
+ (Julien Nioche via siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j configuration to log to stdout on commandline

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=745503&r1=745502&r2=745503&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Feb 18 12:53:12 2009
@@ -1119,6 +1119,15 @@
   
 
 
+
+
 
 
 

Modified: 
lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java?rev=745503&r1=745502&r2=745503&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
 Wed Feb 18 12:53:12 2009
@@ -22,6 +22,13 @@
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.TermQuery;
 
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 import org.apache.nutch.analysis.NutchDocumentAnalyzer;
 import org.apache.nutch.analysis.CommonGrams;
 
@@ -31,7 +38,12 @@
 import org.apache.hadoop.conf.Configuration;
 
 /** The default query filter.  Query terms in the default query field are
- * expanded to search the url, anchor and content document fields.*/
+ * expanded to search the url, anchor and content document fields.
+ * Additional fields can be added by specifying parameters of the form : 
query.basic.(fieldname).boost
+ * to the configuration files (see nutch-default.xml for an example). Such 
fields will be used in the clauses
+ * generated by the BasicQueryFilter e.g. for a user query A B, it generates 
+(field1:A field2:A ...) +(field1:B field2:B).
+ * If you don't want the additional fields to be included in the clauses you 
will need to implement a custom query filter for it.
+ **/
 public class BasicQueryFilter implements QueryFilter {
 
   private static final int  URL_BOOST   = 0;
@@ -44,7 +56,7 @@
 
   private float PHRASE_BOOST;
 
-  private static final String[] FIELDS =
+  private String[] FIELDS =
   { "url", "anchor", "content", "title", "host" };
 
   private float[] FIELD_BOOSTS = new float[5];
@@ -177,9 +189,51 @@
 this.FIELD_BOOSTS[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f);
 this.FIELD_BOOSTS[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f);
 this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f);
+findAdditionalFields(conf);
   }
 
   public Configuration getConf() {
 return this.conf;
   }
+  
+  /** Searches for parameters of the form : query.basic.(fieldname).boost
+   * and adds the field name to the list of default fields.
+   **/
+  private void findAdditionalFields(Configuration conf) {
+// get additional fields specified in parameters
+Pattern pat = Pattern.compile("query\\.basic\\.(.+)\\.boost");
+Iterator confEntriesIterator = conf.iterator(); 
+List existingFields = java.util.Arrays.asList(FIELDS);  
+ArrayList tempfieldNames = new ArrayList();
+ArrayList tempfieldBoosts = new ArrayList();
+while (confEntriesIterator.hasNext()){
+  Map.Entry entry = (Map.Entry) confEntriesIterator.next();
+  String key = entry.getKey().toString();
+  Matcher match = pat.matcher(key);
+  if (!match.matches())continue;
+  String fieldName = match.group(1);
+  if (fieldName!=null){
+// check whether it matches one of the fields which are used by default
+if (existingFields.contains(fieldName)) continue;
+// reserved keyword
+if (fieldName.equa

svn commit: r745517 - /lucene/nutch/trunk/contrib/web2/

2009-02-18 Thread siren
Author: siren
Date: Wed Feb 18 14:03:18 2009
New Revision: 745517

URL: http://svn.apache.org/viewvc?rev=745517&view=rev
Log:
remove web2 as agreed on nutch-dev

Removed:
lucene/nutch/trunk/contrib/web2/