Author: lewismc Date: Tue Aug 18 21:19:07 2015 New Revision: 1696506 URL: http://svn.apache.org/r1696506 Log: NUTCH-1486 Upgrade to Solr 4.10.2
Added: nutch/trunk/src/plugin/index-geoip/build-ivy.xml - copied, changed from r1693938, nutch/trunk/src/plugin/parse-tika/build-ivy.xml nutch/trunk/src/plugin/indexer-solr/build-ivy.xml nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml - copied, changed from r1693468, nutch/trunk/src/plugin/parse-tika/build-ivy.xml Removed: nutch/trunk/conf/schema-solr4.xml Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/log4j.properties nutch/trunk/conf/nutch-default.xml nutch/trunk/conf/schema.xml nutch/trunk/ivy/ivy.xml nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java nutch/trunk/src/plugin/index-geoip/ivy.xml nutch/trunk/src/plugin/index-geoip/plugin.xml nutch/trunk/src/plugin/indexer-solr/ivy.xml nutch/trunk/src/plugin/indexer-solr/plugin.xml nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java nutch/trunk/src/plugin/parse-tika/ivy.xml nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Aug 18 21:19:07 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-1486 Upgrade to Solr 4.10.2 (lewismc, markus) + * NUTCH-2048 parse-tika: fix dependencies in plugin.xml (Michael Joyce via snagel) * NUTCH-2066 Parameterize Generate REST endpoint (Sujen Shah via mattmann) Modified: nutch/trunk/conf/log4j.properties URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/conf/log4j.properties (original) +++ nutch/trunk/conf/log4j.properties Tue Aug 18 21:19:07 2015 @@ -44,10 +44,8 @@ log4j.logger.org.apache.nutch.crawl.Craw log4j.logger.org.apache.nutch.crawl.LinkDb=INFO,cmdstdout log4j.logger.org.apache.nutch.crawl.LinkDbMerger=INFO,cmdstdout log4j.logger.org.apache.nutch.indexer.IndexingJob=INFO,cmdstdout -log4j.logger.org.apache.nutch.indexer.solr.SolrIndexer=INFO,cmdstdout -log4j.logger.org.apache.nutch.indexer.solr.SolrWriter=INFO,cmdstdout -log4j.logger.org.apache.nutch.indexer.solr.SolrDeleteDuplicates=INFO,cmdstdout -log4j.logger.org.apache.nutch.indexer.solr.SolrClean=INFO,cmdstdout +log4j.logger.org.apache.nutch.indexwriter.solr.SolrIndexWriter=INFO,cmdstdout +log4j.logger.org.apache.nutch.indexwriter.solr.SolrUtils=INFO,cmdstdout log4j.logger.org.apache.nutch.scoring.webgraph.WebGraph=INFO,cmdstdout log4j.logger.org.apache.nutch.scoring.webgraph.LinkRank=INFO,cmdstdout log4j.logger.org.apache.nutch.scoring.webgraph.Loops=INFO,cmdstdout Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Tue Aug 18 21:19:07 2015 @@ -1615,6 +1615,18 @@ CAUTION: Set the parser.timeout to -1 or </property> <!-- solr index properties --> + +<property> + <name>solr.server.type</name> + <value>http</value> + <description> + Specifies the SolrServer implementation to use. This is a string value + of one of the following 'cloud', 'concurrent', 'http' or 'lb'. + The values represent CloudSolrServer, ConcurrentUpdateSolrServer, + HttpSolrServer or LBHttpSolrServer respectively. 
+ </description> +</property> + <property> <name>solr.server.url</name> <value>http://127.0.0.1:8983/solr/</value> @@ -1624,6 +1636,25 @@ CAUTION: Set the parser.timeout to -1 or </description> </property> +<property> + <name>solr.zookeeper.url</name> + <value></value> + <description> + Defines the Zookeeper URL which is an essential setting to be used + when using SolrCloud. This should be a fully qualified URL similar to + the property provided within 'solr.server.url' above. + </description> +</property> + +<property> + <name>solr.loadbalance.urls</name> + <value></value> + <description> + A comma-separated value representing the Solr servers to be used when + initiating LBHttpSolrServer as the SolrServer implementation. + </description> +</property> + <property> <name>solr.mapping.file</name> <value>solrindex-mapping.xml</value> Modified: nutch/trunk/conf/schema.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/conf/schema.xml (original) +++ nutch/trunk/conf/schema.xml Tue Aug 18 21:19:07 2015 @@ -1,72 +1,308 @@ <?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- + Description: This document contains Solr 4.x schema definition to + be used with Solr integration currently built into Nutch. + This schema is not minimal, there are some useful field type definitions left, + and the set of fields and their flags (indexed/stored/term vectors) can be + further optimized depending on needs. See + http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup + for more info. +--> + +<schema name="nutch" version="1.5"> + + <types> + + <!-- The StrField type is not analyzed, but indexed/stored verbatim. --> + <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/> + + <fieldtype name="binary" class="solr.BinaryField"/> + + <!-- - Licensed to the Apache Software Foundation (ASF) under one or - more contributor license agreements. See the NOTICE file - distributed with this work for additional information regarding - copyright ownership. The ASF licenses this file to You under the - Apache License, Version 2.0 (the "License"); you may not use - this file except in compliance with the License. You may obtain - a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 Unless required by - applicable law or agreed to in writing, software distributed - under the License is distributed on an "AS IS" BASIS, WITHOUT - WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions - and limitations under the License. + Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types. 
--> + <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <!-- - Description: This document contains Solr 3.1 schema definition to - be used with Solr integration currently build into Nutch. See - https://issues.apache.org/jira/browse/NUTCH-442 - https://issues.apache.org/jira/browse/NUTCH-699 - https://issues.apache.org/jira/browse/NUTCH-994 - https://issues.apache.org/jira/browse/NUTCH-997 - https://issues.apache.org/jira/browse/NUTCH-1058 - https://issues.apache.org/jira/browse/NUTCH-1232 - and - http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/ - example/solr/conf/schema.xml?view=markup - for more info. + Numeric field types that index each value at various levels of precision + to accelerate range queries when the number of values between the range + endpoints is large. See the javadoc for NumericRangeQuery for internal + implementation details. + + Smaller precisionStep values (specified in bits) will lead to more tokens + indexed per value, slightly larger index size, and faster range queries. + A precisionStep of 0 disables indexing at different precision levels. 
--> -<schema name="nutch" version="1.5"> - <types> - <fieldType name="string" class="solr.StrField" sortMissingLast="true" - omitNorms="true"/> - <fieldType name="long" class="solr.TrieLongField" precisionStep="0" - omitNorms="true" positionIncrementGap="0"/> - <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" - omitNorms="true" positionIncrementGap="0"/> - <fieldType name="date" class="solr.TrieDateField" precisionStep="0" - omitNorms="true" positionIncrementGap="0"/> - <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/> - <fieldtype name="binary" class="solr.BinaryField"/> - - <fieldType name="text" class="solr.TextField" - positionIncrementGap="100"> - <analyzer> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - <filter class="solr.StopFilterFactory" - ignoreCase="true" words="stopwords.txt"/> - <filter class="solr.WordDelimiterFilterFactory" - generateWordParts="1" generateNumberParts="1" - catenateWords="1" catenateNumbers="1" catenateAll="0" - splitOnCaseChange="1"/> - <filter class="solr.LowerCaseFilterFactory"/> - <filter class="solr.EnglishPorterFilterFactory" - protected="protwords.txt"/> - <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> - </analyzer> - </fieldType> - <fieldType name="url" class="solr.TextField" - positionIncrementGap="100"> - <analyzer> - <tokenizer class="solr.StandardTokenizerFactory"/> - <filter class="solr.LowerCaseFilterFactory"/> - <filter class="solr.WordDelimiterFilterFactory" - generateWordParts="1" generateNumberParts="1"/> - </analyzer> - </fieldType> - <!-- boolean type: "true" or "false" --> + <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="tdouble" 
class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + + <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and + is a more restricted form of the canonical representation of dateTime + http://www.w3.org/TR/xmlschema-2/#dateTime + The trailing "Z" designates UTC time and is mandatory. + Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z + All other components are mandatory. + + Expressions can also be used to denote calculations that should be + performed relative to "NOW" to determine the value, ie... + + NOW/HOUR + ... Round to the start of the current hour + NOW-1DAY + ... Exactly 1 day prior to now + NOW/DAY+6MONTHS+3DAYS + ... 6 months and 3 days in the future from the start of + the current day + + Consult the DateField javadocs for more information. + + Note: For faster range queries, consider the tdate type + --> + <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/> + + <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/> + + <!-- A Trie based date field for faster date range queries and date faceting. --> + <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/> + + + <!-- solr.TextField allows the specification of custom text analyzers + specified as a tokenizer and a list of token filters. Different + analyzers may be specified for indexing and querying. + + The optional positionIncrementGap puts space between multiple fields of + this type on the same document, with the purpose of preventing false phrase + matching across fields. 
+ + For more info on customizing your analyzer chain, please see + http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters + --> + + <!-- A general text field that has reasonable, generic + cross-language defaults: it tokenizes with StandardTokenizer, + removes stop words from case-insensitive "stopwords.txt" + (empty by default), and down cases. At query time only, it + also applies synonyms. --> + <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <!-- in this example, we will only use synonyms at query time + <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> + --> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> + + <!-- A text field with defaults appropriate for English: it + tokenizes with StandardTokenizer, removes English stop words + (stopwords.txt), down cases, protects words from protwords.txt, and + finally applies Porter's stemming. The query time analyzer + also applies synonyms from synonyms.txt. --> + <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <!-- in this example, we will only use synonyms at query time + <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> + --> + <!-- Case insensitive stop word removal. 
+ add enablePositionIncrements=true in both the index and query + analyzers to leave a 'gap' for more accurate phrase queries. + --> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="stopwords.txt" + enablePositionIncrements="true" + /> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.EnglishPossessiveFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: + <filter class="solr.EnglishMinimalStemFilterFactory"/> + --> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="stopwords.txt" + enablePositionIncrements="true" + /> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.EnglishPossessiveFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory: + <filter class="solr.EnglishMinimalStemFilterFactory"/> + --> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- A text field with defaults appropriate for English, plus + aggressive word-splitting and autophrase features enabled. + This field is just like text_en, except it adds + WordDelimiterFilter to enable splitting and matching of + words on case-change, alpha numeric boundaries, and + non-alphanumeric chars. This means certain compound word + cases will work, for example query "wi fi" will match + document "WiFi" or "wi-fi". However, other cases will still + not match, for example if the query is "wifi" and the + document is "wi fi" or if the query is "wi-fi" and the + document is "wifi". 
+ --> + <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> + <analyzer type="index"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <!-- in this example, we will only use synonyms at query time + <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> + --> + <!-- Case insensitive stop word removal. + add enablePositionIncrements=true in both the index and query + analyzers to leave a 'gap' for more accurate phrase queries. + --> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="stopwords.txt" + enablePositionIncrements="true" + /> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" + ignoreCase="true" + words="stopwords.txt" + enablePositionIncrements="true" + /> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Less flexible matching, but less false matches. Probably not ideal for product names, + but may be good for SKUs. Can insert dashes in the wrong place and still match. 
--> + <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> + <analyzer> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.EnglishMinimalStemFilterFactory"/> + <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes + possible with WordDelimiterFilter in conjunction with stemming. --> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + </fieldType> + + <!-- Just like text_general except it reverses the characters of + each token, to enable more efficient leading wildcard queries. 
--> + <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" + maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> + + <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" > + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> + </analyzer> + </fieldtype> + + <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" > + <analyzer> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <!-- + The DelimitedPayloadTokenFilter can put payloads on tokens... for example, + a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f + Attributes of the DelimitedPayloadTokenFilterFactory : + "delimiter" - a one character delimiter. Default is | (pipe) + "encoder" - how to encode the following value into a payload + float -> org.apache.lucene.analysis.payloads.FloatEncoder, + integer -> o.a.l.a.p.IntegerEncoder + identity -> o.a.l.a.p.IdentityEncoder + Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor. 
+ --> + <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/> + </analyzer> + </fieldtype> + + <!-- lowercases the entire field value, keeping it as a single token. --> + <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.KeywordTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory" /> + </analyzer> + </fieldType> + + <fieldType name="url" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"/> + </analyzer> + </fieldType> + + + <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.PathHierarchyTokenizerFactory"/> + </analyzer> + </fieldType> + + <!-- since fields of this type are by default not stored or indexed, + any data added to them will be ignored outright. --> + <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" /> + + <!-- boolean type: "true" or "false" --> <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/> <!-- sortMissingLast and sortMissingFirst attributes are optional attributes are @@ -83,120 +319,112 @@ - If sortMissingLast="false" and sortMissingFirst="false" (the default), then default lucene sorting will be used which places docs without the field first in an ascending sort and last in a descending sort. 
- --> - </types> - <fields> - <field name="id" type="string" stored="true" indexed="true" - required="true"/> - - <field name="text" type="text" stored="false" indexed="true" multiValued="true"/> - - <!-- core fields --> - <field name="segment" type="string" stored="true" indexed="false"/> - <field name="digest" type="string" stored="true" indexed="false"/> - <field name="boost" type="float" stored="true" indexed="false"/> - - <!-- fields for index-basic plugin --> - <field name="host" type="string" stored="false" indexed="true"/> - <field name="url" type="url" stored="true" indexed="true"/> - <field name="content" type="text" stored="false" indexed="true"/> - <field name="title" type="text" stored="true" indexed="true"/> - <field name="cache" type="string" stored="true" indexed="false"/> - <field name="tstamp" type="date" stored="true" indexed="false"/> - - <!-- fields for index-geoip plugin --> - <field name="ip" type="string" stored="true" indexed="true" /> - <field name="cityName" type="string" stored="true" indexed="true" /> - <field name="cityConfidence" type="int" stored="true" indexed="true" /> - <field name="cityGeoNameId" type="int" stored="true" indexed="true" /> - <field name="continentCode" type="string" stored="true" indexed="true" /> - <field name="continentGeoNameId" type="int" stored="true" indexed="true" /> - <field name="contentName" type="string" stored="true" indexed="true" /> - <field name="countryIsoCode" type="string" stored="true" indexed="true"/> - <field name="countryName" type="string" stored="true" indexed="true" /> - <field name="countryConfidence" type="int" stored="true" indexed="true"/> - <field name="countryGeoNameId" type="int" stored="true" indexed="true"/> - <field name="latLon" type="string" stored="true" indexed="true"/> - <field name="accRadius" type="int" stored="true" indexed="true"/> - <field name="timeZone" type="string" stored="true" indexed="true"/> - <field name="metroCode" type="int" stored="true" indexed="true" /> - 
<field name="postalCode" type="string" stored="true" indexed="true" /> - <field name="postalConfidence" type="int" stored="true" indexed="true" /> - <field name="countryType" type="string" stored="true" indexed="true" /> - <field name="subDivName" type="string" stored="true" indexed="true" /> - <field name="subDivIsoCode" type="string" stored="true" indexed="true" /> - <field name="subDivConfidence" type="int" stored="true" indexed="true" /> - <field name="subDivGeoNameId" type="int" stored="true" indexed="true" /> - <field name="autonSystemNum" type="int" stored="true" indexed="true" /> - <field name="autonSystemOrg" type="string" stored="true" indexed="true" /> - <field name="domain" type="string" stored="true" indexed="true" /> - <field name="isp" type="string" stored="true" indexed="true" /> - <field name="org" type="string" stored="true" indexed="true" /> - <field name="userType" type="string" stored="true" indexed="true" /> - <field name="isAnonProxy" type="boolean" stored="true" indexed="true" /> - <field name="isSatelitteProv" type="boolean" stored="true" indexed="true" /> - <field name="connType" type="string" stored="true" indexed="true" /> - - - - <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false"/> - - <!-- fields for index-anchor plugin --> - <field name="anchor" type="string" stored="true" indexed="true" - multiValued="true"/> - - <!-- fields for index-more plugin --> - <field name="type" type="string" stored="true" indexed="true" - multiValued="true"/> - <field name="contentLength" type="long" stored="true" - indexed="false"/> - <field name="lastModified" type="date" stored="true" - indexed="false"/> - <field name="date" type="date" stored="true" indexed="true"/> - - <!-- fields for languageidentifier plugin --> - <field name="lang" type="string" stored="true" indexed="true"/> - - <!-- fields for subcollection plugin --> - <field name="subcollection" type="string" stored="true" - indexed="true" multiValued="true"/> - - 
<!-- fields for feed plugin (tag is also used by microformats-reltag)--> - <field name="author" type="string" stored="true" indexed="true"/> - <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/> - <field name="feed" type="string" stored="true" indexed="true"/> - <field name="publishedDate" type="date" stored="true" - indexed="true"/> - <field name="updatedDate" type="date" stored="true" - indexed="true"/> - - <!-- fields for creativecommons plugin --> - <field name="cc" type="string" stored="true" indexed="true" - multiValued="true"/> - - <!-- fields for tld plugin --> - <field name="tld" type="string" stored="false" indexed="false"/> - - <!-- field containing segment's raw binary content if indexed with -addBinaryContent --> -+ <field name="binaryContent" type="binary" stored="true" indexed="false"/> - - <!-- to work with Solr 4.9 and beyond that use RealTimeGetHandler --> - <field name="_version_" type="long" indexed="true" stored="true"/> - - </fields> - <uniqueKey>id</uniqueKey> - <defaultSearchField>content</defaultSearchField> - <solrQueryParser defaultOperator="OR"/> - - <!-- copyField commands copy one field to another at the time a document - is added to the index. It's used either to index the same field differently, - or to add multiple fields to the same field for easier/faster searching. 
- --> - <copyField source="content" dest="text"/> - <copyField source="url" dest="text"/> - <copyField source="title" dest="text"/> - <copyField source="anchor" dest="text"/> - <copyField source="author" dest="text"/> - <copyField source="latLon" dest="location"/> + --> + + </types> + <fields> + <field name="id" type="string" stored="true" indexed="true" required="true"/> + <field name="_version_" type="long" indexed="true" stored="true"/> + + <!-- core fields --> + <field name="segment" type="string" stored="true" indexed="false"/> + <field name="digest" type="string" stored="true" indexed="false"/> + <field name="boost" type="float" stored="true" indexed="false"/> + + <!-- fields for index-basic plugin --> + <field name="host" type="url" stored="false" indexed="true"/> + <field name="url" type="url" stored="true" indexed="true"/> + <!-- stored=true for highlighting, use term vectors and positions for fast highlighting --> + <field name="content" type="text_general" stored="true" indexed="true"/> + <field name="title" type="text_general" stored="true" indexed="true"/> + <field name="cache" type="string" stored="true" indexed="false"/> + <field name="tstamp" type="date" stored="true" indexed="false"/> + + <!-- fields for index-geoip plugin --> + <field name="ip" type="string" stored="true" indexed="true" /> + <field name="cityName" type="string" stored="true" indexed="true" /> + <field name="cityConfidence" type="int" stored="true" indexed="true" /> + <field name="cityGeoNameId" type="int" stored="true" indexed="true" /> + <field name="continentCode" type="string" stored="true" indexed="true" /> + <field name="continentGeoNameId" type="int" stored="true" indexed="true" /> + <field name="contentName" type="string" stored="true" indexed="true" /> + <field name="countryIsoCode" type="string" stored="true" indexed="true"/> + <field name="countryName" type="string" stored="true" indexed="true" /> + <field name="countryConfidence" type="int" stored="true" 
indexed="true"/> + <field name="countryGeoNameId" type="int" stored="true" indexed="true"/> + <field name="latLon" type="string" stored="true" indexed="true"/> + <field name="accRadius" type="int" stored="true" indexed="true"/> + <field name="timeZone" type="string" stored="true" indexed="true"/> + <field name="metroCode" type="int" stored="true" indexed="true" /> + <field name="postalCode" type="string" stored="true" indexed="true" /> + <field name="postalConfidence" type="int" stored="true" indexed="true" /> + <field name="countryType" type="string" stored="true" indexed="true" /> + <field name="subDivName" type="string" stored="true" indexed="true" /> + <field name="subDivIsoCode" type="string" stored="true" indexed="true" /> + <field name="subDivConfidence" type="int" stored="true" indexed="true" /> + <field name="subDivGeoNameId" type="int" stored="true" indexed="true" /> + <field name="autonSystemNum" type="int" stored="true" indexed="true" /> + <field name="autonSystemOrg" type="string" stored="true" indexed="true" /> + <field name="domain" type="string" stored="true" indexed="true" /> + <field name="isp" type="string" stored="true" indexed="true" /> + <field name="org" type="string" stored="true" indexed="true" /> + <field name="userType" type="string" stored="true" indexed="true" /> + <field name="isAnonProxy" type="boolean" stored="true" indexed="true" /> + <field name="isSatelitteProv" type="boolean" stored="true" indexed="true" /> + <field name="connType" type="string" stored="true" indexed="true" /> + <field name="location" type="location" stored="true" indexed="true" /> + + <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false"/> + + <!-- catch-all field --> + <field name="text" type="text_general" stored="false" indexed="true" multiValued="true"/> + + <!-- fields for index-anchor plugin --> + <field name="anchor" type="text_general" stored="true" indexed="true" + multiValued="true"/> + + <!-- fields for index-more plugin --> + 
<field name="type" type="string" stored="true" indexed="true" multiValued="true"/> + <field name="contentLength" type="string" stored="true" indexed="false"/> + <field name="lastModified" type="date" stored="true" indexed="false"/> + <field name="date" type="tdate" stored="true" indexed="true"/> + + <!-- fields for languageidentifier plugin --> + <field name="lang" type="string" stored="true" indexed="true"/> + + <!-- fields for subcollection plugin --> + <field name="subcollection" type="string" stored="true" indexed="true" multiValued="true"/> + + <!-- fields for feed plugin (tag is also used by microformats-reltag)--> + <field name="author" type="string" stored="true" indexed="true"/> + <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/> + <field name="feed" type="string" stored="true" indexed="true"/> + <field name="publishedDate" type="date" stored="true" indexed="true"/> + <field name="updatedDate" type="date" stored="true" indexed="true"/> + + <!-- fields for creativecommons plugin --> + <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/> + + <!-- fields for tld plugin --> + <field name="tld" type="string" stored="false" indexed="false"/> + + <!-- field containing segment's raw binary content if indexed with -addBinaryContent --> + <field name="binaryContent" type="binary" stored="true" indexed="false"/> + + </fields> + <uniqueKey>id</uniqueKey> + <defaultSearchField>text</defaultSearchField> + <solrQueryParser defaultOperator="OR"/> + + <!-- copyField commands copy one field to another at the time a document + is added to the index. It's used either to index the same field differently, + or to add multiple fields to the same field for easier/faster searching. 
--> + + <copyField source="content" dest="text"/> + <copyField source="url" dest="text"/> + <copyField source="title" dest="text"/> + <copyField source="anchor" dest="text"/> + <copyField source="author" dest="text"/> + <copyField source="latLon" dest="location"/> </schema> Modified: nutch/trunk/ivy/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/ivy/ivy.xml (original) +++ nutch/trunk/ivy/ivy.xml Tue Aug 18 21:19:07 2015 @@ -34,25 +34,19 @@ </publications> <dependencies> - <dependency org="org.slf4j" name="slf4j-api" rev="1.6.1" - conf="*->master" /> - <dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1" - conf="*->master" /> + <dependency org="org.slf4j" name="slf4j-api" rev="1.6.1" conf="*->master" /> + <dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1" conf="*->master" /> <dependency org="log4j" name="log4j" rev="1.2.15" conf="*->master" /> - <dependency org="commons-lang" name="commons-lang" rev="2.6" - conf="*->default" /> - <dependency org="commons-collections" name="commons-collections" - rev="3.1" conf="*->default" /> - <dependency org="commons-httpclient" name="commons-httpclient" - rev="3.1" conf="*->master" /> - <dependency org="commons-codec" name="commons-codec" rev="1.3" - conf="*->default" /> - <dependency org="org.apache.commons" name="commons-compress" rev="1.9" - conf="*->default" /> - <dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0" - conf="*->default"> + <dependency org="commons-lang" name="commons-lang" rev="2.6" conf="*->default" /> + <dependency org="commons-collections" name="commons-collections" rev="3.1" conf="*->default" /> + <dependency org="commons-httpclient" name="commons-httpclient" rev="3.1" conf="*->master" /> + <dependency org="commons-codec" name="commons-codec" rev="1.3" conf="*->default" /> + <dependency org="org.apache.commons" 
name="commons-compress" rev="1.9" conf="*->default" /> + <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" /> + + <dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0" conf="*->default"> <exclude org="hsqldb" name="hsqldb" /> <exclude org="net.sf.kosmosfs" name="kfs" /> <exclude org="net.java.dev.jets3t" name="jets3t" /> @@ -69,8 +63,9 @@ <dependency org="oro" name="oro" rev="2.0.8" /> <dependency org="com.google.guava" name="guava" rev="11.0.2" /> - <dependency org="com.google.code.crawler-commons" name="crawler-commons" - rev="0.5" /> + + <dependency org="com.github.crawler-commons" name="crawler-commons" rev="0.6" /> + <dependency org="org.apache.cxf" name="cxf" rev="3.0.4"/> <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.0.4"/> <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.0.4"/> @@ -79,40 +74,20 @@ <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.5.1" /> <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" /> <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.5.1" /> - - <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" /> - - <dependency org="org.apache.mahout" name="mahout-math" rev="0.8" /> - <dependency org="org.apache.mahout" name="mahout-core" rev="0.8" /> - <dependency org="org.apache.lucene" name="lucene-core" rev="4.3.0" /> - <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.3.0" /> - <!--Configuration: test --> - <!--artifacts needed for testing --> <dependency org="junit" name="junit" rev="4.11" conf="test->default" /> - - <dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0" - conf="test->default" /> - - <dependency org="org.mortbay.jetty" name="jetty-client" - rev="6.1.22" conf="test->default" /> - - <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22" - conf="test->default" /> - <dependency 
org="org.mortbay.jetty" name="jetty-util" rev="6.1.22" - conf="test->default" /> - - <!-- naive bayes parse filter --> - <dependency org="org.apache.mahout.commons" name="commons-cli" rev="2.0-mahout" - conf="test->default" /> + <dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0" conf="test->default" /> + <dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.22" conf="test->default" /> + <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22" conf="test->default" /> + <dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.22" conf="test->default" /> + <!-- end of test artifacts --> <!--global exclusion --> <exclude module="jmxtools" /> <exclude module="jms" /> <exclude module="jmxri" /> - <exclude org="com.thoughtworks.xstream"/> - <exclude org="org.apache.mrunit"/> + <exclude org="com.thoughtworks.xstream"/> </dependencies> Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Tue Aug 18 21:19:07 2015 @@ -19,7 +19,6 @@ package org.apache.nutch.indexer; // Hadoop imports import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; // Nutch imports Copied: nutch/trunk/src/plugin/index-geoip/build-ivy.xml (from r1693938, nutch/trunk/src/plugin/parse-tika/build-ivy.xml) URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/build-ivy.xml?p2=nutch/trunk/src/plugin/index-geoip/build-ivy.xml&p1=nutch/trunk/src/plugin/parse-tika/build-ivy.xml&r1=1693938&r2=1696506&rev=1696506&view=diff ============================================================================== --- 
nutch/trunk/src/plugin/parse-tika/build-ivy.xml (original) +++ nutch/trunk/src/plugin/index-geoip/build-ivy.xml Tue Aug 18 21:19:07 2015 @@ -15,7 +15,7 @@ See the License for the specific language governing permissions and limitations under the License. --> -<project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> +<project name="index-geoip" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> <property name="ivy.install.version" value="2.1.0" /> <condition property="ivy.home" value="${env.IVY_HOME}"> Modified: nutch/trunk/src/plugin/index-geoip/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-geoip/ivy.xml (original) +++ nutch/trunk/src/plugin/index-geoip/ivy.xml Tue Aug 18 21:19:07 2015 @@ -36,7 +36,11 @@ </publications> <dependencies> - <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.1.0" /> + <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.3.1" > + <!-- Exclude due to classpath issues --> + <exclude org="org.apache.httpcomponents" name="httpclient" /> + <exclude org="org.apache.httpcomponents" name="httpcore" /> + </dependency> </dependencies> </ivy-module> Modified: nutch/trunk/src/plugin/index-geoip/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/plugin.xml?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-geoip/plugin.xml (original) +++ nutch/trunk/src/plugin/index-geoip/plugin.xml Tue Aug 18 21:19:07 2015 @@ -25,15 +25,13 @@ <library name="index-geoip.jar"> <export name="*"/> </library> - <library name="geoip2-2.1.0.jar"/> <library name="commons-codec-1.6.jar"/> <library name="commons-logging-1.1.1.jar"/> - <library name="google-http-client-1.19.0.jar"/> - <library 
name="httpclient-4.0.1.jar"/> - <library name="httpcore-4.0.1.jar"/> - <library name="jackson-annotations-2.4.0.jar"/> - <library name="jackson-core-2.4.3.jar"/> - <library name="jackson-databind-2.4.3.jar"/> + <library name="geoip2-2.3.1.jar"/> + <library name="google-http-client-1.20.0.jar"/> + <library name="jackson-annotations-2.5.0.jar"/> + <library name="jackson-core-2.5.3.jar"/> + <library name="jackson-databind-2.5.3.jar"/> <library name="jsr305-1.3.9.jar"/> <library name="maxmind-db-1.0.0.jar"/> </runtime> Added: nutch/trunk/src/plugin/indexer-solr/build-ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/build-ivy.xml?rev=1696506&view=auto ============================================================================== --- nutch/trunk/src/plugin/indexer-solr/build-ivy.xml (added) +++ nutch/trunk/src/plugin/indexer-solr/build-ivy.xml Tue Aug 18 21:19:07 2015 @@ -0,0 +1,54 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="indexer-solr" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> + + <property name="ivy.install.version" value="2.1.0" /> + <condition property="ivy.home" value="${env.IVY_HOME}"> + <isset property="env.IVY_HOME" /> + </condition> + <property name="ivy.home" value="${user.home}/.ant" /> + <property name="ivy.checksums" value="" /> + <property name="ivy.jar.dir" value="${ivy.home}/lib" /> + <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> + + <target name="download-ivy" unless="offline"> + + <mkdir dir="${ivy.jar.dir}"/> + <!-- download Ivy from web site so that it can be used even without any special installation --> + <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" + dest="${ivy.jar.file}" usetimestamp="true"/> + </target> + + <target name="init-ivy" depends="download-ivy"> + <!-- try to load ivy here from ivy home, in case the user has not already dropped + it into ant's lib dir (note that the latter copy will always take precedence). + We will not fail as long as local lib dir exists (it may be empty) and + ivy is in at least one of ant's lib dir or the local lib dir. 
--> + <path id="ivy.lib.path"> + <fileset dir="${ivy.jar.dir}" includes="*.jar"/> + + </path> + <taskdef resource="org/apache/ivy/ant/antlib.xml" + uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> + </target> + + <target name="deps-jar" depends="init-ivy"> + <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/> + </target> + +</project> Modified: nutch/trunk/src/plugin/indexer-solr/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-solr/ivy.xml (original) +++ nutch/trunk/src/plugin/indexer-solr/ivy.xml Tue Aug 18 21:19:07 2015 @@ -36,8 +36,9 @@ </publications> <dependencies> - <dependency org="org.apache.solr" name="solr-solrj" rev="3.4.0" - conf="*->default"/> + <dependency org="org.apache.solr" name="solr-solrj" rev="4.10.2" conf="*->default"/> + <dependency org="org.apache.httpcomponents" name="httpclient" rev="4.3.1" conf="*->default"/> + <dependency org="org.apache.httpcomponents" name="httpmime" rev="4.3.1" conf="*->default"/> </dependencies> </ivy-module> Modified: nutch/trunk/src/plugin/indexer-solr/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/plugin.xml?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-solr/plugin.xml (original) +++ nutch/trunk/src/plugin/indexer-solr/plugin.xml Tue Aug 18 21:19:07 2015 @@ -15,29 +15,24 @@ See the License for the specific language governing permissions and limitations under the License. 
--> -<plugin id="indexer-solr" name="SOLRIndexWriter" version="1.0.0" +<plugin id="indexer-solr" name="SolrIndexWriter" version="1.0.0" provider-name="nutch.apache.org"> <runtime> <library name="indexer-solr.jar"> <export name="*" /> </library> - - <library name="activation-1.1.jar"/> - <library name="commons-codec-1.4.jar"/> - <library name="commons-httpclient-3.1.jar"/> - <library name="commons-io-1.4.jar"/> - <library name="commons-logging-1.1.1.jar"/> - <library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/> - <library name="jline-0.9.1.jar"/> - <library name="log4j-1.2.15.jar"/> - <library name="lucene-core-3.4.0.jar"/> - <library name="mail-1.4.1.jar"/> - <library name="slf4j-api-1.6.1.jar"/> - <library name="solr-solrj-3.4.0.jar"/> - <library name="stax-api-1.0.1.jar"/> - <library name="wstx-asl-3.2.7.jar"/> - <library name="zookeeper-3.3.1.jar"/> + <library name="commons-codec-1.9.jar"/> + <library name="commons-io-2.3.jar"/> + <library name="commons-logging-1.1.3.jar"/> + <library name="httpclient-4.3.1.jar"/> + <library name="httpcore-4.3.jar"/> + <library name="httpmime-4.3.1.jar"/> + <library name="noggit-0.5.jar"/> + <library name="slf4j-api-1.7.6.jar"/> + <library name="solr-solrj-4.10.2.jar"/> + <library name="wstx-asl-3.2.7.jar"/> + <library name="zookeeper-3.4.6.jar"/> </runtime> <requires> @@ -45,9 +40,9 @@ </requires> <extension id="org.apache.nutch.indexer.solr" - name="SOLR Index Writer" + name="Solr Index Writer" point="org.apache.nutch.indexer.IndexWriter"> - <implementation id="SOLRIndexWriter" + <implementation id="SolrIndexWriter" class="org.apache.nutch.indexwriter.solr.SolrIndexWriter" /> </extension> Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1696506&r1=1696505&r2=1696506&view=diff 
============================================================================== --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java (original) +++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java Tue Aug 18 21:19:07 2015 @@ -17,6 +17,7 @@ package org.apache.nutch.indexwriter.solr; public interface SolrConstants { + public static final String SOLR_PREFIX = "solr."; public static final String SERVER_URL = SOLR_PREFIX + "server.url"; @@ -30,7 +31,13 @@ public interface SolrConstants { public static final String USERNAME = SOLR_PREFIX + "auth.username"; public static final String PASSWORD = SOLR_PREFIX + "auth.password"; - + + public static final String SERVER_TYPE = SOLR_PREFIX + "server.type"; + + public static final String ZOOKEEPER_URL = SOLR_PREFIX + "zookeeper.url"; + + public static final String LOADBALANCE_URLS = SOLR_PREFIX + "loadbalance.urls"; + @Deprecated public static final String COMMIT_INDEX = SOLR_PREFIX + "commit.index"; Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java (original) +++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java Tue Aug 18 21:19:07 2015 @@ -55,7 +55,7 @@ public class SolrIndexWriter implements private boolean delete = false; public void open(JobConf job, String name) throws IOException { - SolrServer server = SolrUtils.getCommonsHttpSolrServer(job); + SolrServer server = SolrUtils.getSolrServer(job); init(server, job); } @@ -183,7 +183,7 @@ public class SolrIndexWriter 
implements config = conf; String serverURL = conf.get(SolrConstants.SERVER_URL); if (serverURL == null) { - String message = "Missing SOLR URL. Should be set via -D " + String message = "Missing Solr URL. Should be set via -D " + SolrConstants.SERVER_URL; message += "\n" + describe(); LOG.error(message); @@ -192,15 +192,20 @@ public class SolrIndexWriter implements } public String describe() { - StringBuffer sb = new StringBuffer("SOLRIndexWriter\n"); + StringBuffer sb = new StringBuffer("SolrIndexWriter\n"); + sb.append("\t").append(SolrConstants.SERVER_TYPE) + .append(" : Type of SolrServer to communicate with (default 'http' however options include 'cloud', 'lb' and 'concurrent')\n"); sb.append("\t").append(SolrConstants.SERVER_URL) - .append(" : URL of the SOLR instance (mandatory)\n"); - sb.append("\t").append(SolrConstants.COMMIT_SIZE) - .append(" : buffer size when sending to SOLR (default 1000)\n"); + .append(" : URL of the Solr instance (mandatory)\n"); + sb.append("\t").append(SolrConstants.ZOOKEEPER_URL) + .append(" : URL of the Zookeeper URL (mandatory if 'cloud' value for solr.server.type)\n"); + sb.append("\t").append(SolrConstants.LOADBALANCE_URLS) + .append(" : Comma-separated string of Solr server strings to be used (madatory if 'lb' value for solr.server.type)\n"); sb.append("\t") .append(SolrConstants.MAPPING_FILE) - .append( - " : name of the mapping file for fields (default solrindex-mapping.xml)\n"); + .append(" : name of the mapping file for fields (default solrindex-mapping.xml)\n"); + sb.append("\t").append(SolrConstants.COMMIT_SIZE) + .append(" : buffer size when sending to Solr (default 1000)\n"); sb.append("\t").append(SolrConstants.USE_AUTH) .append(" : use authentication (default false)\n"); sb.append("\t").append(SolrConstants.USERNAME) @@ -209,5 +214,4 @@ public class SolrIndexWriter implements .append(" : password for authentication\n"); return sb.toString(); } - } Modified: 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java (original) +++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java Tue Aug 18 21:19:07 2015 @@ -16,14 +16,20 @@ */ package org.apache.nutch.indexwriter.solr; -import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.auth.AuthScope; -import org.apache.commons.httpclient.UsernamePasswordCredentials; -import org.apache.commons.httpclient.params.HttpClientParams; +import org.apache.http.impl.client.BasicCredentialsProvider; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.CredentialsProvider; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.mapred.JobConf; -import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer; +import org.apache.solr.client.solrj.impl.CloudSolrServer; +import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer; +import org.apache.solr.client.solrj.impl.HttpSolrServer; +import org.apache.solr.client.solrj.impl.LBHttpSolrServer; +import org.apache.solr.client.solrj.SolrServer; import java.net.MalformedURLException; @@ -31,33 +37,62 @@ public class SolrUtils { public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class); - public static CommonsHttpSolrServer getCommonsHttpSolrServer(JobConf job) + private static SolrServer server; + + public static SolrServer getSolrServer(JobConf job) throws 
MalformedURLException { - HttpClient client = new HttpClient(); + boolean auth = job.getBoolean(SolrConstants.USE_AUTH, false); + + CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); // Check for username/password - if (job.getBoolean(SolrConstants.USE_AUTH, false)) { + if (auth) { String username = job.get(SolrConstants.USERNAME); - LOG.info("Authenticating as: " + username); - AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, AuthScope.ANY_REALM, AuthScope.ANY_SCHEME); - - client.getState().setCredentials( - scope, - new UsernamePasswordCredentials(username, job - .get(SolrConstants.PASSWORD))); - - HttpClientParams params = client.getParams(); - params.setAuthenticationPreemptive(true); - - client.setParams(params); + credentialsProvider.setCredentials(scope, + new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD))); } + CloseableHttpClient client = + HttpClientBuilder.create().setDefaultCredentialsProvider(credentialsProvider).build(); - String serverURL = job.get(SolrConstants.SERVER_URL); - - return new CommonsHttpSolrServer(serverURL, client); + String solrServer = job.get(SolrConstants.SERVER_TYPE, "http"); + String zkHost = job.get(SolrConstants.ZOOKEEPER_URL, null); + String solrServerUrl = job.get(SolrConstants.SERVER_URL); + + switch (solrServer) { + case "cloud": + server = new CloudSolrServer(zkHost); + LOG.debug("CloudSolrServer selected as indexing server."); + break; + case "concurrent": + server = new ConcurrentUpdateSolrServer(solrServerUrl, client, 1000, 10); + LOG.debug("ConcurrentUpdateSolrServer selected as indexing server."); + break; + case "http": + if (auth) { + server = new HttpSolrServer(solrServerUrl, client); + } else { + server = new HttpSolrServer(solrServerUrl); + } + LOG.debug("HttpSolrServer selected as indexing server."); + break; + case "lb": + String[] lbServerString = job.get(SolrConstants.LOADBALANCE_URLS).split(","); + server = new LBHttpSolrServer(client, 
lbServerString); + LOG.debug("LBHttpSolrServer selected as indexing server."); + break; + default: + if (auth) { + server = new HttpSolrServer(solrServerUrl, client); + } else { + server = new HttpSolrServer(solrServerUrl); + } + LOG.debug("HttpSolrServer selected as indexing server."); + break; + } + return server; } public static String stripNonCharCodepoints(String input) { @@ -82,4 +117,4 @@ public class SolrUtils { return retval.toString(); } -} +} \ No newline at end of file Modified: nutch/trunk/src/plugin/parse-tika/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/ivy.xml (original) +++ nutch/trunk/src/plugin/parse-tika/ivy.xml Tue Aug 18 21:19:07 2015 @@ -38,6 +38,8 @@ <dependencies> <dependency org="org.apache.tika" name="tika-parsers" rev="1.8" conf="*->default"> <exclude org="org.apache.tika" name="tika-core" /> + <exclude org="org.apache.httpcomponents" name="httpclient" /> + <exclude org="org.apache.httpcomponents" name="httpcore" /> </dependency> <override module="rome" rev="0.9"/> </dependencies> Copied: nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml (from r1693468, nutch/trunk/src/plugin/parse-tika/build-ivy.xml) URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml?p2=nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml&p1=nutch/trunk/src/plugin/parse-tika/build-ivy.xml&r1=1693468&r2=1696506&rev=1696506&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/build-ivy.xml (original) +++ nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml Tue Aug 18 21:19:07 2015 @@ -15,7 +15,7 @@ See the License for the specific language governing permissions and limitations under the License. 
--> -<project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> +<project name="parsefilter-naivebayes" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> <property name="ivy.install.version" value="2.1.0" /> <condition property="ivy.home" value="${env.IVY_HOME}"> Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml (original) +++ nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml Tue Aug 18 21:19:07 2015 @@ -36,6 +36,14 @@ </publications> <dependencies> + + <dependency org="org.apache.mahout" name="mahout-math" rev="0.10.1" /> + <dependency org="org.apache.mahout" name="mahout-core" rev="0.9" > + <exclude org="org.apache.mrunit" name="mrunit"/> + </dependency> + <dependency org="org.apache.lucene" name="lucene-core" rev="4.10.2" /> + <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.10.2" /> + </dependencies> </ivy-module> Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml (original) +++ nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml Tue Aug 18 21:19:07 2015 @@ -25,10 +25,22 @@ <library name="parsefilter-naivebayes.jar"> <export name="*"/> </library> - <library name="lucene-analyzers-common-4.3.0.jar"/> - <library name="mahout-math-0.8.jar"/> - <library name="mahout-core-0.8.jar"/> - <library name="lucene-core-4.3.0.jar"/> + <library name="commons-cli-2.0-mahout.jar"/> + <library name="commons-lang3-3.1.jar"/> + <library 
name="commons-math3-3.2.jar"/> + <library name="guava-14.0.1.jar"/> + <library name="jackson-core-asl-1.9.12.jar"/> + <library name="jackson-mapper-asl-1.9.12.jar"/> + <library name="lucene-analyzers-common-4.10.2.jar"/> + <library name="lucene-core-4.10.2.jar"/> + <library name="mahout-core-0.9.jar"/> + <library name="mahout-math-0.10.1.jar"/> + <library name="slf4j-api-1.7.12.jar"/> + <library name="solr-commons-csv-3.5.0.jar"/> + <library name="t-digest-3.1.jar"/> + <library name="xmlpull-1.1.3.1.jar"/> + <library name="xpp3_min-1.1.4c.jar"/> + <library name="xstream-1.4.4.jar"/> </runtime> <requires> @@ -36,10 +48,9 @@ </requires> <extension id="org.apache.nutch.htmlparsefilter.naivebayes" - name="Nutch Parser Filter" - point="org.apache.nutch.parse.HtmlParseFilter"> - <implementation id="NaiveBayesHTMLParseFilter" - class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/> + name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="NaiveBayesHTMLParseFilter" + class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/> </extension> </plugin> Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java?rev=1696506&r1=1696505&r2=1696506&view=diff ============================================================================== --- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java (original) +++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java Tue Aug 18 21:19:07 2015 @@ -35,7 +35,6 @@ import org.apache.lucene.analysis.Analyz import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import 
org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.Version; import org.apache.mahout.classifier.naivebayes.BayesUtils; import org.apache.mahout.classifier.naivebayes.NaiveBayesModel; import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier; @@ -47,17 +46,12 @@ import org.apache.mahout.math.Vector; import org.apache.mahout.math.Vector.Element; import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles; import org.apache.mahout.vectorizer.TFIDF; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import com.google.common.collect.ConcurrentHashMultiset; import com.google.common.collect.Multiset; public class NaiveBayesClassifier { private static NaiveBayesModel model = null; - private static final Logger LOG = LoggerFactory - .getLogger(NaiveBayesClassifier.class); public static Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryPath) { @@ -114,7 +108,7 @@ public class NaiveBayesClassifier { public static String classify(String text, String modelPath, String labelIndexPath, String dictionaryPath, String documentFrequencyPath) - throws IOException { + throws IOException { Configuration configuration = new Configuration(); @@ -134,7 +128,7 @@ public class NaiveBayesClassifier { new Path(documentFrequencyPath)); // analyzer used to extract word from text - Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); + Analyzer analyzer = new StandardAnalyzer(); // int labelCount = labels.size(); int documentCount = documentFrequency.get(-1).intValue();