svn commit: r745416 - /lucene/nutch/trunk/build.xml
Author: siren Date: Wed Feb 18 08:11:46 2009 New Revision: 745416 URL: http://svn.apache.org/viewvc?rev=745416&view=rev Log: NUTCH-687 add RAT Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=745416&r1=745415&r2=745416&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Wed Feb 18 08:11:46 2009 @@ -610,4 +610,23 @@ + + + + + + + + + + + + + + + + + +
svn commit: r745446 - in /lucene/nutch/trunk/src: java/org/apache/nutch/util/ plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/ plugin/field-boost/src/java/org/apache/nutch/indexer/fie
Author: siren Date: Wed Feb 18 09:14:29 2009 New Revision: 745446 URL: http://svn.apache.org/viewvc?rev=745446&view=rev Log: NUTCH-688 add missing headers, part 2 rest Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java lucene/nutch/trunk/src/plugin/response-json/src/java/org/apache/nutch/searcher/response/json/JSONResponseWriter.java lucene/nutch/trunk/src/plugin/response-xml/src/java/org/apache/nutch/searcher/response/xml/XMLResponseWriter.java lucene/nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java lucene/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java lucene/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java?rev=745446&r1=745445&r2=745446&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java Wed Feb 18 09:14:29 2009 @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.nutch.util; import java.io.DataInput; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=745446&r1=745445&r2=745446&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Wed Feb 18 09:14:29 2009 @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.nutch.util; import java.util.Stack; Modified: lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java?rev=745446&r1=745445&r2=745446&view=diff == --- lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java (original) +++ lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java Wed Feb 18 09:14:29 2009 @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except i
svn commit: r745448 - /lucene/nutch/trunk/build.xml
Author: siren Date: Wed Feb 18 09:18:07 2009 New Revision: 745448 URL: http://svn.apache.org/viewvc?rev=745448&view=rev Log: NUTCH-687 add RAT, also check plugins Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?rev=745448&r1=745447&r2=745448&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Wed Feb 18 09:18:07 2009 @@ -624,7 +624,9 @@ - + + +
svn commit: r745499 - in /lucene/nutch/trunk: ./ src/plugin/lib-jakarta-poi/ src/plugin/lib-jakarta-poi/lib/ src/plugin/parse-msword/ src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ sr
Author: siren Date: Wed Feb 18 12:43:04 2009 New Revision: 745499 URL: http://svn.apache.org/viewvc?rev=745499&view=rev Log: NUTCH-691 - Update jakarta poi jars to the most relevant version, contributed by Dmitry Lihachev Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar (with props) lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar (with props) Removed: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.0-alpha1-20050704.jar lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.0-alpha1-20050704.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml lucene/nutch/trunk/src/plugin/parse-msword/build.xml lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745499&r1=745498&r2=745499&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:43:04 2009 @@ -343,6 +343,9 @@ 128. NUTCH-631 - MoreIndexingFilter fails with NoSuchElementException (Stefan Will, siren) +129. NUTCH-691 - Update jakarta poi jars to the most relevant version + (Dmitry Lihachev via siren) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar?rev=745499&view=auto == Binary file - no diff available. 
Propchange: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-3.5-beta4-20081128.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar?rev=745499&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/lib/poi-scratchpad-3.5-beta4-20081128.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml?rev=745499&r1=745498&r2=745499&view=diff == --- lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/lib-jakarta-poi/plugin.xml Wed Feb 18 12:43:04 2009 @@ -29,10 +29,10 @@ provider-name="jakarta.apache.org"> - + - + Modified: lucene/nutch/trunk/src/plugin/parse-msword/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/build.xml?rev=745499&r1=745498&r2=745499&view=diff == --- lucene/nutch/trunk/src/plugin/parse-msword/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/build.xml Wed Feb 18 12:43:04 2009 @@ -44,7 +44,8 @@ - - + + + Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java?rev=745499&r1=745498&r2=745499&view=diff == --- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java Wed Feb 18 12:43:04 2009 @@ -53,8 +53,9 @@ int chpTableSize = LittleEndian.getInt(mainStream, 0xbc); // get a list of character 
properties + Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset, - chpTableSize, fcMin); + chpTableSize, fcMin, new TextPieceTable()); List textRuns = chp
svn commit: r745503 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
Author: siren Date: Wed Feb 18 12:53:12 2009 New Revision: 745503 URL: http://svn.apache.org/viewvc?rev=745503&view=rev Log: NUTCH-563 Include custom fields in BasicQueryFilter, contributed by Julien Nioche Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745503&r1=745502&r2=745503&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:53:12 2009 @@ -346,6 +346,9 @@ 129. NUTCH-691 - Update jakarta poi jars to the most relevant version (Dmitry Lihachev via siren) +130. NUTCH-563 - Include custom fields in BasicQueryFilter + (Julien Nioche via siren) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=745503&r1=745502&r2=745503&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Wed Feb 18 12:53:12 2009 @@ -1119,6 +1119,15 @@ + + Modified: lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java?rev=745503&r1=745502&r2=745503&view=diff == --- lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java (original) +++ lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java Wed Feb 18 12:53:12 2009 @@ -22,6 +22,13 @@ import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.TermQuery; +import java.util.ArrayList; +import java.util.Iterator; +import 
java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + import org.apache.nutch.analysis.NutchDocumentAnalyzer; import org.apache.nutch.analysis.CommonGrams; @@ -31,7 +38,12 @@ import org.apache.hadoop.conf.Configuration; /** The default query filter. Query terms in the default query field are - * expanded to search the url, anchor and content document fields.*/ + * expanded to search the url, anchor and content document fields. + * Additional fields can be added by specifying parameters of the form : query.basic.(fieldname).boost + * to the configuration files (see nutch-default.xml for an example). Such fields will be used in the clauses + * generated by the BasicQueryFilter e.g. for a user query A B, it generates +(field1:A field2:A ...) +(field1:B field2:B). + * If you don't want the additional fields to be included in the clauses you will need to implement a custom query filter for it. + **/ public class BasicQueryFilter implements QueryFilter { private static final int URL_BOOST = 0; @@ -44,7 +56,7 @@ private float PHRASE_BOOST; - private static final String[] FIELDS = + private String[] FIELDS = { "url", "anchor", "content", "title", "host" }; private float[] FIELD_BOOSTS = new float[5]; @@ -177,9 +189,51 @@ this.FIELD_BOOSTS[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f); this.FIELD_BOOSTS[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f); this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f); +findAdditionalFields(conf); } public Configuration getConf() { return this.conf; } + + /** Searches for parameters of the form : query.basic.(fieldname).boost + * and adds the fieldname to the list of default fields. 
+ **/ + private void findAdditionalFields(Configuration conf) { +// get additional fields specified in parameters +Pattern pat = Pattern.compile("query\\.basic\\.(.+)\\.boost"); +Iterator confEntriesIterator = conf.iterator(); +List existingFields = java.util.Arrays.asList(FIELDS); +ArrayList tempfieldNames = new ArrayList(); +ArrayList tempfieldBoosts = new ArrayList(); +while (confEntriesIterator.hasNext()){ + Map.Entry entry = (Map.Entry) confEntriesIterator.next(); + String key = entry.getKey().toString(); + Matcher match = pat.matcher(key); + if (!match.matches())continue; + String fieldName = match.group(1); + if (fieldName!=null){ +// check whether it matches one of the fields which are used by default +if (existingFields.contains(fieldName)) continue; +// reserved keyword +if (fieldName.equa
svn commit: r745517 - /lucene/nutch/trunk/contrib/web2/
Author: siren Date: Wed Feb 18 14:03:18 2009 New Revision: 745517 URL: http://svn.apache.org/viewvc?rev=745517&view=rev Log: remove web2 as agreed on nutch-dev Removed: lucene/nutch/trunk/contrib/web2/