Author: lewismc
Date: Tue Aug 18 21:19:07 2015
New Revision: 1696506

URL: http://svn.apache.org/r1696506
Log:
NUTCH-1486 Upgrade to Solr 4.10.2

Added:
    nutch/trunk/src/plugin/index-geoip/build-ivy.xml
      - copied, changed from r1693938, 
nutch/trunk/src/plugin/parse-tika/build-ivy.xml
    nutch/trunk/src/plugin/indexer-solr/build-ivy.xml
    nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml
      - copied, changed from r1693468, 
nutch/trunk/src/plugin/parse-tika/build-ivy.xml
Removed:
    nutch/trunk/conf/schema-solr4.xml
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/log4j.properties
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/conf/schema.xml
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
    nutch/trunk/src/plugin/index-geoip/ivy.xml
    nutch/trunk/src/plugin/index-geoip/plugin.xml
    nutch/trunk/src/plugin/indexer-solr/ivy.xml
    nutch/trunk/src/plugin/indexer-solr/plugin.xml
    
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
    
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
    
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
    nutch/trunk/src/plugin/parse-tika/ivy.xml
    nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml
    nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml
    
nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Aug 18 21:19:07 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-1486 Upgrade to Solr 4.10.2 (lewismc, markus)
+
 * NUTCH-2048 parse-tika: fix dependencies in plugin.xml (Michael Joyce via 
snagel)
 
 * NUTCH-2066 Parameterize Generate REST endpoint (Sujen Shah via mattmann)

Modified: nutch/trunk/conf/log4j.properties
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/conf/log4j.properties (original)
+++ nutch/trunk/conf/log4j.properties Tue Aug 18 21:19:07 2015
@@ -44,10 +44,8 @@ log4j.logger.org.apache.nutch.crawl.Craw
 log4j.logger.org.apache.nutch.crawl.LinkDb=INFO,cmdstdout
 log4j.logger.org.apache.nutch.crawl.LinkDbMerger=INFO,cmdstdout
 log4j.logger.org.apache.nutch.indexer.IndexingJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrIndexer=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrWriter=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrDeleteDuplicates=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrClean=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexwriter.solr.SolrIndexWriter=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexwriter.solr.SolrUtils=INFO,cmdstdout
 log4j.logger.org.apache.nutch.scoring.webgraph.WebGraph=INFO,cmdstdout
 log4j.logger.org.apache.nutch.scoring.webgraph.LinkRank=INFO,cmdstdout
 log4j.logger.org.apache.nutch.scoring.webgraph.Loops=INFO,cmdstdout

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Aug 18 21:19:07 2015
@@ -1615,6 +1615,18 @@ CAUTION: Set the parser.timeout to -1 or
 </property>
 
 <!-- solr index properties -->
+
+<property>
+  <name>solr.server.type</name>
+  <value>http</value>
+  <description>
+    Specifies the SolrServer implementation to use. This is a string value
+    of one of the following 'cloud', 'concurrent', 'http' or 'lb'.
+    The values represent CloudSolrServer, ConcurrentUpdateSolrServer, 
+    HttpSolrServer or LBHttpSolrServer respectively.
+  </description>
+</property>
+
 <property>
   <name>solr.server.url</name>
   <value>http://127.0.0.1:8983/solr/</value>
@@ -1624,6 +1636,25 @@ CAUTION: Set the parser.timeout to -1 or
   </description>
 </property>
 
+<property>
+  <name>solr.zookeeper.url</name>
+  <value></value>
+  <description>
+      Defines the Zookeeper URL which is an essential setting to be used 
+      when using SolrCloud. This should be a fully qualified URL similar to
+      the property provided within 'solr.server.url' above.
+  </description>
+</property>
+
+<property>
+  <name>solr.loadbalance.urls</name>
+  <value></value>
+  <description>
+      A comma-separated value representing the Solr servers to be used when
+      initiating LBHttpSolrServer as the SolrServer implementation. 
+  </description>
+</property>
+
 <property>
   <name>solr.mapping.file</name>
   <value>solrindex-mapping.xml</value>

Modified: nutch/trunk/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/conf/schema.xml (original)
+++ nutch/trunk/conf/schema.xml Tue Aug 18 21:19:07 2015
@@ -1,72 +1,308 @@
 <?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+    Description: This document contains Solr 4.x schema definition to
+    be used with Solr integration currently built into Nutch.
+    This schema is not minimal, there are some useful field type definitions 
left,
+    and the set of fields and their flags (indexed/stored/term vectors) can be
+    further optimized depending on needs.  See
+    
http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/schema.xml?view=markup
+    for more info.
+-->
+
+<schema name="nutch" version="1.5">
+
+  <types>
+
+    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+    <fieldType name="string" class="solr.StrField" sortMissingLast="true" 
omitNorms="true"/>
+
+    <fieldtype name="binary" class="solr.BinaryField"/>
+
+
     <!--
-        Licensed to the Apache Software Foundation (ASF) under one or
-        more contributor license agreements. See the NOTICE file
-        distributed with this work for additional information regarding
-        copyright ownership. The ASF licenses this file to You under the
-        Apache License, Version 2.0 (the "License"); you may not use
-        this file except in compliance with the License. You may obtain
-        a copy of the License at
-        http://www.apache.org/licenses/LICENSE-2.0 Unless required by
-        applicable law or agreed to in writing, software distributed
-        under the License is distributed on an "AS IS" BASIS, WITHOUT
-        WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-        See the License for the specific language governing permissions
-        and limitations under the License.
+      Default numeric field types. For faster range queries, consider the 
tint/tfloat/tlong/tdouble types.
     -->
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
+
     <!--
-        Description: This document contains Solr 3.1 schema definition to
-        be used with Solr integration currently build into Nutch. See
-        https://issues.apache.org/jira/browse/NUTCH-442
-        https://issues.apache.org/jira/browse/NUTCH-699
-        https://issues.apache.org/jira/browse/NUTCH-994
-        https://issues.apache.org/jira/browse/NUTCH-997
-        https://issues.apache.org/jira/browse/NUTCH-1058
-        https://issues.apache.org/jira/browse/NUTCH-1232
-        and
-        http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/
-        example/solr/conf/schema.xml?view=markup
-        for more info.
+     Numeric field types that index each value at various levels of precision
+     to accelerate range queries when the number of values between the range
+     endpoints is large. See the javadoc for NumericRangeQuery for internal
+     implementation details.
+
+     Smaller precisionStep values (specified in bits) will lead to more tokens
+     indexed per value, slightly larger index size, and faster range queries.
+     A precisionStep of 0 disables indexing at different precision levels.
     -->
-<schema name="nutch" version="1.5">
-    <types>
-        <fieldType name="string" class="solr.StrField" sortMissingLast="true"
-            omitNorms="true"/> 
-        <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
-            omitNorms="true" positionIncrementGap="0"/>
-        <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
-            omitNorms="true" positionIncrementGap="0"/>
-        <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
-            omitNorms="true" positionIncrementGap="0"/>
-        <fieldType name="location" class="solr.LatLonType" 
subFieldSuffix="_coordinate"/>
-        <fieldtype name="binary" class="solr.BinaryField"/>
-
-        <fieldType name="text" class="solr.TextField"
-            positionIncrementGap="100">
-            <analyzer>
-                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-                <filter class="solr.StopFilterFactory"
-                    ignoreCase="true" words="stopwords.txt"/>
-                <filter class="solr.WordDelimiterFilterFactory"
-                    generateWordParts="1" generateNumberParts="1"
-                    catenateWords="1" catenateNumbers="1" catenateAll="0"
-                    splitOnCaseChange="1"/>
-                <filter class="solr.LowerCaseFilterFactory"/>
-                <filter class="solr.EnglishPorterFilterFactory"
-                    protected="protwords.txt"/>
-                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
-            </analyzer>
-        </fieldType>
-        <fieldType name="url" class="solr.TextField"
-            positionIncrementGap="100">
-            <analyzer>
-                <tokenizer class="solr.StandardTokenizerFactory"/>
-                <filter class="solr.LowerCaseFilterFactory"/>
-                <filter class="solr.WordDelimiterFilterFactory"
-                    generateWordParts="1" generateNumberParts="1"/>
-            </analyzer>
-        </fieldType>
-            <!-- boolean type: "true" or "false" -->
+    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
+
+    <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, 
and
+         is a more restricted form of the canonical representation of dateTime
+         http://www.w3.org/TR/xmlschema-2/#dateTime    
+         The trailing "Z" designates UTC time and is mandatory.
+         Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
+         All other components are mandatory.
+
+         Expressions can also be used to denote calculations that should be
+         performed relative to "NOW" to determine the value, ie...
+
+               NOW/HOUR
+                  ... Round to the start of the current hour
+               NOW-1DAY
+                  ... Exactly 1 day prior to now
+               NOW/DAY+6MONTHS+3DAYS
+                  ... 6 months and 3 days in the future from the start of
+                      the current day
+                      
+         Consult the DateField javadocs for more information.
+
+         Note: For faster range queries, consider the tdate type
+      -->
+    <fieldType name="date" class="solr.TrieDateField" omitNorms="true" 
precisionStep="0" positionIncrementGap="0"/>
+    
+    <fieldType name="location" class="solr.LatLonType" 
subFieldSuffix="_coordinate"/>
+    
+    <!-- A Trie based date field for faster date range queries and date 
faceting. -->
+    <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" 
precisionStep="6" positionIncrementGap="0"/>
+
+
+    <!-- solr.TextField allows the specification of custom text analyzers
+         specified as a tokenizer and a list of token filters. Different
+         analyzers may be specified for indexing and querying.
+
+         The optional positionIncrementGap puts space between multiple fields 
of
+         this type on the same document, with the purpose of preventing false 
phrase
+         matching across fields.
+
+         For more info on customizing your analyzer chain, please see
+         http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
+     -->
+
+    <!-- A general text field that has reasonable, generic
+         cross-language defaults: it tokenizes with StandardTokenizer,
+        removes stop words from case-insensitive "stopwords.txt"
+        (empty by default), and down cases.  At query time only, it
+        also applies synonyms. -->
+    <fieldType name="text_general" class="solr.TextField" 
positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true" />
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" 
synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- A text field with defaults appropriate for English: it
+         tokenizes with StandardTokenizer, removes English stop words
+         (stopwords.txt), down cases, protects words from protwords.txt, and
+         finally applies Porter's stemming.  The query time analyzer
+         also applies synonyms from synonyms.txt. -->
+    <fieldType name="text_en" class="solr.TextField" 
positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" 
synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <!-- Case insensitive stop word removal.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
+        -->
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.LowerCaseFilterFactory"/>
+       <filter class="solr.EnglishPossessiveFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
+       <!-- Optionally you may want to use this less aggressive stemmer 
instead of PorterStemFilterFactory:
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+       -->
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.LowerCaseFilterFactory"/>
+       <filter class="solr.EnglishPossessiveFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
+       <!-- Optionally you may want to use this less aggressive stemmer 
instead of PorterStemFilterFactory:
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+       -->
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- A text field with defaults appropriate for English, plus
+        aggressive word-splitting and autophrase features enabled.
+        This field is just like text_en, except it adds
+        WordDelimiterFilter to enable splitting and matching of
+        words on case-change, alpha numeric boundaries, and
+        non-alphanumeric chars.  This means certain compound word
+        cases will work, for example query "wi fi" will match
+        document "WiFi" or "wi-fi".  However, other cases will still
+        not match, for example if the query is "wifi" and the
+        document is "wi fi" or if the query is "wi-fi" and the
+        document is "wifi".
+        -->
+    <fieldType name="text_en_splitting" class="solr.TextField" 
positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" 
synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <!-- Case insensitive stop word removal.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
+        -->
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" 
splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" 
splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Less flexible matching, but fewer false matches.  Probably not ideal 
for product names,
+         but may be good for SKUs.  Can insert dashes in the wrong place and 
still match. -->
+    <fieldType name="text_en_splitting_tight" class="solr.TextField" 
positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="false"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" 
generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+        <!-- this filter can remove any duplicate tokens that appear at the 
same position - sometimes
+             possible with WordDelimiterFilter in conjunction with stemming. -->
+        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- Just like text_general except it reverses the characters of
+        each token, to enable more efficient leading wildcard queries. -->
+    <fieldType name="text_general_rev" class="solr.TextField" 
positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
+           maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <fieldtype name="phonetic" stored="false" indexed="true" 
class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+      </analyzer>
+    </fieldtype>
+
+    <fieldtype name="payloads" stored="false" indexed="true" 
class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <!--
+        The DelimitedPayloadTokenFilter can put payloads on tokens... for 
example,
+        a token of "foo|1.4"  would be indexed as "foo" with a payload of 1.4f
+        Attributes of the DelimitedPayloadTokenFilterFactory : 
+         "delimiter" - a one character delimiter. Default is | (pipe)
+        "encoder" - how to encode the following value into a payload
+           float -> org.apache.lucene.analysis.payloads.FloatEncoder,
+           integer -> o.a.l.a.p.IntegerEncoder
+           identity -> o.a.l.a.p.IdentityEncoder
+            Fully Qualified class name implementing PayloadEncoder, Encoder 
must have a no arg constructor.
+         -->
+        <filter class="solr.DelimitedPayloadTokenFilterFactory" 
encoder="float"/>
+      </analyzer>
+    </fieldtype>
+
+    <!-- lowercases the entire field value, keeping it as a single token.  -->
+    <fieldType name="lowercase" class="solr.TextField" 
positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory" />
+      </analyzer>
+    </fieldType>
+
+    <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+           <filter class="solr.LowerCaseFilterFactory"/>
+           <filter class="solr.WordDelimiterFilterFactory" 
generateWordParts="1" generateNumberParts="1"/>
+      </analyzer>
+    </fieldType>
+
+
+    <fieldType name="text_path" class="solr.TextField" 
positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- since fields of this type are by default not stored or indexed,
+         any data added to them will be ignored outright.  --> 
+    <fieldtype name="ignored" stored="false" indexed="false" 
multiValued="true" class="solr.StrField" />
+
+        <!-- boolean type: "true" or "false" -->
         <fieldType name="boolean" class="solr.BoolField" 
sortMissingLast="true"/>
 
          <!-- sortMissingLast and sortMissingFirst attributes are optional 
attributes are
@@ -83,120 +319,112 @@
        - If sortMissingLast="false" and sortMissingFirst="false" (the default),
          then default lucene sorting will be used which places docs without the
          field first in an ascending sort and last in a descending sort.
-         -->  
-    </types>
-    <fields>
-        <field name="id" type="string" stored="true" indexed="true"
-            required="true"/>
-
-        <field name="text" type="text" stored="false" indexed="true" 
multiValued="true"/>
-
-        <!-- core fields -->
-        <field name="segment" type="string" stored="true" indexed="false"/>
-        <field name="digest" type="string" stored="true" indexed="false"/>
-        <field name="boost" type="float" stored="true" indexed="false"/>
-
-        <!-- fields for index-basic plugin -->
-        <field name="host" type="string" stored="false" indexed="true"/>
-        <field name="url" type="url" stored="true" indexed="true"/>
-        <field name="content" type="text" stored="false" indexed="true"/>
-        <field name="title" type="text" stored="true" indexed="true"/>
-        <field name="cache" type="string" stored="true" indexed="false"/>
-        <field name="tstamp" type="date" stored="true" indexed="false"/>
-        
-        <!-- fields for index-geoip plugin -->
-        <field name="ip" type="string" stored="true" indexed="true" />
-        <field name="cityName" type="string" stored="true" indexed="true" />
-        <field name="cityConfidence" type="int" stored="true" indexed="true" />
-        <field name="cityGeoNameId" type="int" stored="true" indexed="true" />
-        <field name="continentCode" type="string" stored="true" indexed="true" 
/>
-        <field name="continentGeoNameId" type="int" stored="true" 
indexed="true" />
-        <field name="contentName" type="string" stored="true" indexed="true" />
-        <field name="countryIsoCode" type="string" stored="true" 
indexed="true"/>
-        <field name="countryName" type="string" stored="true" indexed="true" />
-        <field name="countryConfidence" type="int" stored="true" 
indexed="true"/>
-        <field name="countryGeoNameId" type="int" stored="true" 
indexed="true"/>        
-        <field name="latLon" type="string" stored="true" indexed="true"/>
-        <field name="accRadius" type="int" stored="true" indexed="true"/>
-        <field name="timeZone" type="string" stored="true" indexed="true"/>
-        <field name="metroCode" type="int" stored="true" indexed="true" />
-        <field name="postalCode" type="string" stored="true" indexed="true" />
-        <field name="postalConfidence" type="int" stored="true" indexed="true" 
/>
-        <field name="countryType" type="string" stored="true" indexed="true" />
-        <field name="subDivName" type="string" stored="true" indexed="true" />
-        <field name="subDivIsoCode" type="string" stored="true" indexed="true" 
/>
-        <field name="subDivConfidence" type="int" stored="true" indexed="true" 
/>
-        <field name="subDivGeoNameId" type="int" stored="true" indexed="true" 
/>
-        <field name="autonSystemNum" type="int" stored="true" indexed="true" />
-        <field name="autonSystemOrg" type="string" stored="true" 
indexed="true" />
-        <field name="domain" type="string" stored="true" indexed="true" />
-        <field name="isp" type="string" stored="true" indexed="true" />
-        <field name="org" type="string" stored="true" indexed="true" />
-        <field name="userType" type="string" stored="true" indexed="true" />
-        <field name="isAnonProxy" type="boolean" stored="true" indexed="true" 
/>
-        <field name="isSatelitteProv" type="boolean" stored="true" 
indexed="true" />
-        <field name="connType" type="string" stored="true" indexed="true" />
-        
-
-        
-        <dynamicField name="*_coordinate" type="tdouble" indexed="true" 
stored="false"/>
-
-        <!-- fields for index-anchor plugin -->
-        <field name="anchor" type="string" stored="true" indexed="true"
-            multiValued="true"/>
-
-        <!-- fields for index-more plugin -->
-        <field name="type" type="string" stored="true" indexed="true"
-            multiValued="true"/>
-        <field name="contentLength" type="long" stored="true"
-            indexed="false"/>
-        <field name="lastModified" type="date" stored="true"
-            indexed="false"/>
-        <field name="date" type="date" stored="true" indexed="true"/>
-
-        <!-- fields for languageidentifier plugin -->
-        <field name="lang" type="string" stored="true" indexed="true"/>
-
-        <!-- fields for subcollection plugin -->
-        <field name="subcollection" type="string" stored="true"
-            indexed="true" multiValued="true"/>
-
-        <!-- fields for feed plugin (tag is also used by 
microformats-reltag)-->
-        <field name="author" type="string" stored="true" indexed="true"/>
-        <field name="tag" type="string" stored="true" indexed="true" 
multiValued="true"/>
-        <field name="feed" type="string" stored="true" indexed="true"/>
-        <field name="publishedDate" type="date" stored="true"
-            indexed="true"/>
-        <field name="updatedDate" type="date" stored="true"
-            indexed="true"/>
-
-        <!-- fields for creativecommons plugin -->
-        <field name="cc" type="string" stored="true" indexed="true"
-            multiValued="true"/>
-            
-        <!-- fields for tld plugin -->    
-        <field name="tld" type="string" stored="false" indexed="false"/>
-
-        <!-- field containing segment's raw binary content if indexed with 
-addBinaryContent -->
-+       <field name="binaryContent" type="binary" stored="true" 
indexed="false"/>
-
-        <!-- to work with Solr 4.9 and beyond that use RealTimeGetHandler -->
-        <field name="_version_" type="long" indexed="true" stored="true"/>
-
-    </fields>
-    <uniqueKey>id</uniqueKey>
-    <defaultSearchField>content</defaultSearchField>
-    <solrQueryParser defaultOperator="OR"/>
-
-    <!-- copyField commands copy one field to another at the time a document
-       is added to the index. It's used either to index the same field 
differently,
-       or to add multiple fields to the same field for easier/faster 
searching. 
-    -->
-    <copyField source="content" dest="text"/>
-    <copyField source="url" dest="text"/>
-    <copyField source="title" dest="text"/>
-    <copyField source="anchor" dest="text"/>
-    <copyField source="author" dest="text"/>
-    <copyField source="latLon" dest="location"/>
+         -->
+
+ </types>
 
+ <fields>
+    <field name="id" type="string" stored="true" indexed="true" 
required="true"/>
+    <field name="_version_" type="long" indexed="true" stored="true"/>
+
+    <!-- core fields -->
+    <field name="segment" type="string" stored="true" indexed="false"/>
+    <field name="digest" type="string" stored="true" indexed="false"/>
+    <field name="boost" type="float" stored="true" indexed="false"/>
+
+    <!-- fields for index-basic plugin -->
+    <field name="host" type="url" stored="false" indexed="true"/>
+    <field name="url" type="url" stored="true" indexed="true"/>
+    <!-- stored=true for highlighting, use term vectors  and positions for 
fast highlighting -->
+    <field name="content" type="text_general" stored="true" indexed="true"/>
+    <field name="title" type="text_general" stored="true" indexed="true"/>
+    <field name="cache" type="string" stored="true" indexed="false"/>
+    <field name="tstamp" type="date" stored="true" indexed="false"/>
+
+    <!-- fields for index-geoip plugin -->
+    <field name="ip" type="string" stored="true" indexed="true" />
+    <field name="cityName" type="string" stored="true" indexed="true" />
+    <field name="cityConfidence" type="int" stored="true" indexed="true" />
+    <field name="cityGeoNameId" type="int" stored="true" indexed="true" />
+    <field name="continentCode" type="string" stored="true" indexed="true" />
+    <field name="continentGeoNameId" type="int" stored="true" indexed="true" />
+    <field name="contentName" type="string" stored="true" indexed="true" />
+    <field name="countryIsoCode" type="string" stored="true" indexed="true"/>
+    <field name="countryName" type="string" stored="true" indexed="true" />
+    <field name="countryConfidence" type="int" stored="true" indexed="true"/>
+    <field name="countryGeoNameId" type="int" stored="true" indexed="true"/>
+    <field name="latLon" type="string" stored="true" indexed="true"/>
+    <field name="accRadius" type="int" stored="true" indexed="true"/>
+    <field name="timeZone" type="string" stored="true" indexed="true"/>
+    <field name="metroCode" type="int" stored="true" indexed="true" />
+    <field name="postalCode" type="string" stored="true" indexed="true" />
+    <field name="postalConfidence" type="int" stored="true" indexed="true" />
+    <field name="countryType" type="string" stored="true" indexed="true" />
+    <field name="subDivName" type="string" stored="true" indexed="true" />
+    <field name="subDivIsoCode" type="string" stored="true" indexed="true" />
+    <field name="subDivConfidence" type="int" stored="true" indexed="true" />
+    <field name="subDivGeoNameId" type="int" stored="true" indexed="true" /> 
+    <field name="autonSystemNum" type="int" stored="true" indexed="true" />
+    <field name="autonSystemOrg" type="string" stored="true" indexed="true" />
+    <field name="domain" type="string" stored="true" indexed="true" />
+    <field name="isp" type="string" stored="true" indexed="true" />
+    <field name="org" type="string" stored="true" indexed="true" />
+    <field name="userType" type="string" stored="true" indexed="true" />
+    <field name="isAnonProxy" type="boolean" stored="true" indexed="true" />
+    <field name="isSatelitteProv" type="boolean" stored="true" indexed="true" 
/>
+    <field name="connType" type="string" stored="true" indexed="true" />
+    <field name="location" type="location" stored="true" indexed="true" />
+
+    <dynamicField name="*_coordinate" type="tdouble" indexed="true" 
stored="false"/>
+
+    <!-- catch-all field -->
+    <field name="text" type="text_general" stored="false" indexed="true" 
multiValued="true"/>
+
+    <!-- fields for index-anchor plugin -->
+    <field name="anchor" type="text_general" stored="true" indexed="true"
+        multiValued="true"/>
+
+    <!-- fields for index-more plugin -->
+    <field name="type" type="string" stored="true" indexed="true" 
multiValued="true"/>
+    <field name="contentLength" type="string" stored="true" indexed="false"/>
+    <field name="lastModified" type="date" stored="true" indexed="false"/>
+    <field name="date" type="tdate" stored="true" indexed="true"/>
+
+    <!-- fields for languageidentifier plugin -->
+    <field name="lang" type="string" stored="true" indexed="true"/>
+
+    <!-- fields for subcollection plugin -->
+    <field name="subcollection" type="string" stored="true" indexed="true" 
multiValued="true"/>
+
+    <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
+    <field name="author" type="string" stored="true" indexed="true"/>
+    <field name="tag" type="string" stored="true" indexed="true" 
multiValued="true"/>
+    <field name="feed" type="string" stored="true" indexed="true"/>
+    <field name="publishedDate" type="date" stored="true" indexed="true"/>
+    <field name="updatedDate" type="date" stored="true" indexed="true"/>
+
+    <!-- fields for creativecommons plugin -->
+    <field name="cc" type="string" stored="true" indexed="true" 
multiValued="true"/>
+
+    <!-- fields for tld plugin -->    
+    <field name="tld" type="string" stored="false" indexed="false"/>
+
+    <!-- field containing segment's raw binary content if indexed with 
-addBinaryContent -->
+    <field name="binaryContent" type="binary" stored="true" indexed="false"/>
+
+ </fields>
+ <uniqueKey>id</uniqueKey>
+ <defaultSearchField>text</defaultSearchField>
+ <solrQueryParser defaultOperator="OR"/>
+
+  <!-- copyField commands copy one field to another at the time a document
+        is added to the index.  It's used either to index the same field 
differently,
+        or to add multiple fields to the same field for easier/faster 
searching.  -->
+
+ <copyField source="content" dest="text"/>
+ <copyField source="url" dest="text"/>
+ <copyField source="title" dest="text"/>
+ <copyField source="anchor" dest="text"/>
+ <copyField source="author" dest="text"/>
+ <copyField source="latLon" dest="location"/>
 </schema>

Modified: nutch/trunk/ivy/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Tue Aug 18 21:19:07 2015
@@ -34,25 +34,19 @@
        </publications>
        
        <dependencies>
-               <dependency org="org.slf4j" name="slf4j-api" rev="1.6.1"
-                       conf="*->master" />
-               <dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"
-                       conf="*->master" />
+               <dependency org="org.slf4j" name="slf4j-api" rev="1.6.1" 
conf="*->master" />
+               <dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1" 
conf="*->master" />
                
                <dependency org="log4j" name="log4j" rev="1.2.15" 
conf="*->master" />
                
-               <dependency org="commons-lang" name="commons-lang" rev="2.6"
-                       conf="*->default" />
-               <dependency org="commons-collections" name="commons-collections"
-                       rev="3.1" conf="*->default" />
-               <dependency org="commons-httpclient" name="commons-httpclient"
-                       rev="3.1" conf="*->master" />
-               <dependency org="commons-codec" name="commons-codec" rev="1.3"
-                       conf="*->default" />
-                <dependency org="org.apache.commons" name="commons-compress" 
rev="1.9" 
-                        conf="*->default" />   
-               <dependency org="org.apache.hadoop" name="hadoop-core" 
rev="1.2.0"
-                       conf="*->default">
+               <dependency org="commons-lang" name="commons-lang" rev="2.6" 
conf="*->default" />
+               <dependency org="commons-collections" 
name="commons-collections" rev="3.1" conf="*->default" />
+               <dependency org="commons-httpclient" name="commons-httpclient" 
rev="3.1" conf="*->master" />
+               <dependency org="commons-codec" name="commons-codec" rev="1.3" 
conf="*->default" />
+        <dependency org="org.apache.commons" name="commons-compress" rev="1.9" 
conf="*->default" />    
+        <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
+
+               <dependency org="org.apache.hadoop" name="hadoop-core" 
rev="1.2.0" conf="*->default">
                        <exclude org="hsqldb" name="hsqldb" />
                        <exclude org="net.sf.kosmosfs" name="kfs" />
                        <exclude org="net.java.dev.jets3t" name="jets3t" />
@@ -69,8 +63,9 @@
                <dependency org="oro" name="oro" rev="2.0.8" />
 
                <dependency org="com.google.guava" name="guava" rev="11.0.2" />
-               <dependency org="com.google.code.crawler-commons" 
name="crawler-commons"
-                       rev="0.5" />
+
+               <dependency org="com.github.crawler-commons" 
name="crawler-commons" rev="0.6" />
+               
         <dependency org="org.apache.cxf" name="cxf" rev="3.0.4"/>
         <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" 
rev="3.0.4"/>
         <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" 
rev="3.0.4"/>
@@ -79,40 +74,20 @@
         <dependency org="com.fasterxml.jackson.core" name="jackson-databind" 
rev="2.5.1" /> 
         <dependency org="com.fasterxml.jackson.dataformat" 
name="jackson-dataformat-cbor" rev="2.5.1" />
         <dependency org="com.fasterxml.jackson.jaxrs" 
name="jackson-jaxrs-json-provider" rev="2.5.1" />        
-
-        <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
-
-        <dependency org="org.apache.mahout" name="mahout-math" rev="0.8" />
-        <dependency org="org.apache.mahout" name="mahout-core" rev="0.8" />
-        <dependency org="org.apache.lucene" name="lucene-core" rev="4.3.0" />
-        <dependency org="org.apache.lucene" name="lucene-analyzers-common" 
rev="4.3.0" />
               
-               <!--Configuration: test -->
-
                <!--artifacts needed for testing -->
                <dependency org="junit" name="junit" rev="4.11" 
conf="test->default" />
-
-               <dependency org="org.apache.hadoop" name="hadoop-test" 
rev="1.2.0"
-                       conf="test->default" />
-
-               <dependency org="org.mortbay.jetty" name="jetty-client"
-                       rev="6.1.22" conf="test->default" />
-
-               <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22"
-                       conf="test->default" />
-               <dependency org="org.mortbay.jetty" name="jetty-util" 
rev="6.1.22"
-                       conf="test->default" />
-
-                <!-- naive bayes parse filter -->
-                <dependency org="org.apache.mahout.commons" name="commons-cli" 
rev="2.0-mahout"
-                        conf="test->default" />
+               <dependency org="org.apache.hadoop" name="hadoop-test" 
rev="1.2.0" conf="test->default" />
+               <dependency org="org.mortbay.jetty" name="jetty-client" 
rev="6.1.22" conf="test->default" />
+               <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22" 
conf="test->default" />
+               <dependency org="org.mortbay.jetty" name="jetty-util" 
rev="6.1.22" conf="test->default" />
+               <!-- end of test artifacts -->
 
                <!--global exclusion -->
                <exclude module="jmxtools" />
                <exclude module="jms" />
                <exclude module="jmxri" />
-                <exclude org="com.thoughtworks.xstream"/>
-                <exclude org="org.apache.mrunit"/>
+        <exclude org="com.thoughtworks.xstream"/>
 
        </dependencies>
 

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Tue Aug 
18 21:19:07 2015
@@ -19,7 +19,6 @@ package org.apache.nutch.indexer;
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 
 // Nutch imports

Copied: nutch/trunk/src/plugin/index-geoip/build-ivy.xml (from r1693938, 
nutch/trunk/src/plugin/parse-tika/build-ivy.xml)
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/build-ivy.xml?p2=nutch/trunk/src/plugin/index-geoip/build-ivy.xml&p1=nutch/trunk/src/plugin/parse-tika/build-ivy.xml&r1=1693938&r2=1696506&rev=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/build-ivy.xml (original)
+++ nutch/trunk/src/plugin/index-geoip/build-ivy.xml Tue Aug 18 21:19:07 2015
@@ -15,7 +15,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 -->
-<project name="parse-tika" default="deps-jar" 
xmlns:ivy="antlib:org.apache.ivy.ant">
+<project name="index-geoip" default="deps-jar" 
xmlns:ivy="antlib:org.apache.ivy.ant">
 
     <property name="ivy.install.version" value="2.1.0" />
     <condition property="ivy.home" value="${env.IVY_HOME}">

Modified: nutch/trunk/src/plugin/index-geoip/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/ivy.xml (original)
+++ nutch/trunk/src/plugin/index-geoip/ivy.xml Tue Aug 18 21:19:07 2015
@@ -36,7 +36,11 @@
   </publications>
 
   <dependencies>
-    <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.1.0" />
+    <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.3.1" >
+      <!-- Exclude due to classpath issues -->
+      <exclude org="org.apache.httpcomponents" name="httpclient" />
+      <exclude org="org.apache.httpcomponents" name="httpcore" />
+    </dependency>
   </dependencies>
   
 </ivy-module>

Modified: nutch/trunk/src/plugin/index-geoip/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/plugin.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/plugin.xml (original)
+++ nutch/trunk/src/plugin/index-geoip/plugin.xml Tue Aug 18 21:19:07 2015
@@ -25,15 +25,13 @@
       <library name="index-geoip.jar">
          <export name="*"/>
       </library>
-      <library name="geoip2-2.1.0.jar"/>
       <library name="commons-codec-1.6.jar"/>
       <library name="commons-logging-1.1.1.jar"/>
-      <library name="google-http-client-1.19.0.jar"/>
-      <library name="httpclient-4.0.1.jar"/>
-      <library name="httpcore-4.0.1.jar"/>
-      <library name="jackson-annotations-2.4.0.jar"/>
-      <library name="jackson-core-2.4.3.jar"/>
-      <library name="jackson-databind-2.4.3.jar"/>
+      <library name="geoip2-2.3.1.jar"/>
+      <library name="google-http-client-1.20.0.jar"/>
+      <library name="jackson-annotations-2.5.0.jar"/>
+      <library name="jackson-core-2.5.3.jar"/>
+      <library name="jackson-databind-2.5.3.jar"/>
       <library name="jsr305-1.3.9.jar"/>
       <library name="maxmind-db-1.0.0.jar"/>
    </runtime>

Added: nutch/trunk/src/plugin/indexer-solr/build-ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/build-ivy.xml?rev=1696506&view=auto
==============================================================================
--- nutch/trunk/src/plugin/indexer-solr/build-ivy.xml (added)
+++ nutch/trunk/src/plugin/indexer-solr/build-ivy.xml Tue Aug 18 21:19:07 2015
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-solr" default="deps-jar" 
xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without 
any special installation -->
+        <get 
src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not 
already dropped
+              it into ant's lib dir (note that the latter copy will always 
take precedence).
+              We will not fail as long as local lib dir exists (it may be 
empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

Modified: nutch/trunk/src/plugin/indexer-solr/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-solr/ivy.xml (original)
+++ nutch/trunk/src/plugin/indexer-solr/ivy.xml Tue Aug 18 21:19:07 2015
@@ -36,8 +36,9 @@
   </publications>
 
   <dependencies>
-   <dependency org="org.apache.solr" name="solr-solrj" rev="3.4.0"
-               conf="*->default"/>
+    <dependency org="org.apache.solr" name="solr-solrj" rev="4.10.2" 
conf="*->default"/>
+    <dependency org="org.apache.httpcomponents" name="httpclient" rev="4.3.1"  
conf="*->default"/>
+   <dependency org="org.apache.httpcomponents" name="httpmime" rev="4.3.1"  
conf="*->default"/>
   </dependencies>
   
 </ivy-module>

Modified: nutch/trunk/src/plugin/indexer-solr/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/plugin.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-solr/plugin.xml (original)
+++ nutch/trunk/src/plugin/indexer-solr/plugin.xml Tue Aug 18 21:19:07 2015
@@ -15,29 +15,24 @@
   See the License for the specific language governing permissions and
   limitations under the License.
 -->
-<plugin id="indexer-solr" name="SOLRIndexWriter" version="1.0.0"
+<plugin id="indexer-solr" name="SolrIndexWriter" version="1.0.0"
   provider-name="nutch.apache.org">
 
   <runtime>
     <library name="indexer-solr.jar">
       <export name="*" />
     </library>
-
-     <library name="activation-1.1.jar"/>
-     <library name="commons-codec-1.4.jar"/>
-     <library name="commons-httpclient-3.1.jar"/>
-     <library name="commons-io-1.4.jar"/>
-     <library name="commons-logging-1.1.1.jar"/>
-     <library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/>
-     <library name="jline-0.9.1.jar"/>
-     <library name="log4j-1.2.15.jar"/>
-     <library name="lucene-core-3.4.0.jar"/>
-     <library name="mail-1.4.1.jar"/>
-     <library name="slf4j-api-1.6.1.jar"/>
-     <library name="solr-solrj-3.4.0.jar"/>
-     <library name="stax-api-1.0.1.jar"/>
-     <library name="wstx-asl-3.2.7.jar"/>
-     <library name="zookeeper-3.3.1.jar"/>
+      <library name="commons-codec-1.9.jar"/>
+      <library name="commons-io-2.3.jar"/>
+      <library name="commons-logging-1.1.3.jar"/>
+      <library name="httpclient-4.3.1.jar"/>
+      <library name="httpcore-4.3.jar"/>
+      <library name="httpmime-4.3.1.jar"/>
+      <library name="noggit-0.5.jar"/>
+      <library name="slf4j-api-1.7.6.jar"/>
+      <library name="solr-solrj-4.10.2.jar"/>
+      <library name="wstx-asl-3.2.7.jar"/>
+      <library name="zookeeper-3.4.6.jar"/>
   </runtime>
 
   <requires>
@@ -45,9 +40,9 @@
   </requires>
 
   <extension id="org.apache.nutch.indexer.solr"
-    name="SOLR Index Writer"
+    name="Solr Index Writer"
     point="org.apache.nutch.indexer.IndexWriter">
-    <implementation id="SOLRIndexWriter"
+    <implementation id="SolrIndexWriter"
       class="org.apache.nutch.indexwriter.solr.SolrIndexWriter" />
   </extension>
 

Modified: 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
 (original)
+++ 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
 Tue Aug 18 21:19:07 2015
@@ -17,6 +17,7 @@
 package org.apache.nutch.indexwriter.solr;
 
 public interface SolrConstants {
+  
   public static final String SOLR_PREFIX = "solr.";
 
   public static final String SERVER_URL = SOLR_PREFIX + "server.url";
@@ -30,7 +31,13 @@ public interface SolrConstants {
   public static final String USERNAME = SOLR_PREFIX + "auth.username";
 
   public static final String PASSWORD = SOLR_PREFIX + "auth.password";
-
+  
+  public static final String SERVER_TYPE = SOLR_PREFIX + "server.type";
+  
+  public static final String ZOOKEEPER_URL = SOLR_PREFIX + "zookeeper.url";
+  
+  public static final String LOADBALANCE_URLS = SOLR_PREFIX + 
"loadbalance.urls";
+  
   @Deprecated
   public static final String COMMIT_INDEX = SOLR_PREFIX + "commit.index";
 

Modified: 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
 (original)
+++ 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
 Tue Aug 18 21:19:07 2015
@@ -55,7 +55,7 @@ public class SolrIndexWriter implements
   private boolean delete = false;
 
   public void open(JobConf job, String name) throws IOException {
-    SolrServer server = SolrUtils.getCommonsHttpSolrServer(job);
+    SolrServer server = SolrUtils.getSolrServer(job);
     init(server, job);
   }
 
@@ -183,7 +183,7 @@ public class SolrIndexWriter implements
     config = conf;
     String serverURL = conf.get(SolrConstants.SERVER_URL);
     if (serverURL == null) {
-      String message = "Missing SOLR URL. Should be set via -D "
+      String message = "Missing Solr URL. Should be set via -D "
           + SolrConstants.SERVER_URL;
       message += "\n" + describe();
       LOG.error(message);
@@ -192,15 +192,20 @@ public class SolrIndexWriter implements
   }
 
   public String describe() {
-    StringBuffer sb = new StringBuffer("SOLRIndexWriter\n");
+    StringBuffer sb = new StringBuffer("SolrIndexWriter\n");
+    sb.append("\t").append(SolrConstants.SERVER_TYPE)
+        .append(" : Type of SolrServer to communicate with (default 'http' however options include 'cloud', 'lb' and 'concurrent')\n");
     sb.append("\t").append(SolrConstants.SERVER_URL)
-        .append(" : URL of the SOLR instance (mandatory)\n");
-    sb.append("\t").append(SolrConstants.COMMIT_SIZE)
-        .append(" : buffer size when sending to SOLR (default 1000)\n");
+        .append(" : URL of the Solr instance (mandatory)\n");
+    sb.append("\t").append(SolrConstants.ZOOKEEPER_URL)
+        .append(" : URL of the Zookeeper URL (mandatory if 'cloud' value for solr.server.type)\n");
+    sb.append("\t").append(SolrConstants.LOADBALANCE_URLS)
+        .append(" : Comma-separated string of Solr server strings to be used (madatory if 'lb' value for solr.server.type)\n");
     sb.append("\t")
         .append(SolrConstants.MAPPING_FILE)
-        .append(
-            " : name of the mapping file for fields (default solrindex-mapping.xml)\n");
+        .append(" : name of the mapping file for fields (default solrindex-mapping.xml)\n");
+    sb.append("\t").append(SolrConstants.COMMIT_SIZE)
+        .append(" : buffer size when sending to Solr (default 1000)\n");
     sb.append("\t").append(SolrConstants.USE_AUTH)
         .append(" : use authentication (default false)\n");
     sb.append("\t").append(SolrConstants.USERNAME)
@@ -209,5 +214,4 @@ public class SolrIndexWriter implements
         .append(" : password for authentication\n");
     return sb.toString();
   }
-
 }

Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java (original)
+++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java Tue Aug 18 21:19:07 2015
@@ -16,14 +16,20 @@
  */
 package org.apache.nutch.indexwriter.solr;
 
-import org.apache.commons.httpclient.HttpClient;
-import org.apache.commons.httpclient.auth.AuthScope;
-import org.apache.commons.httpclient.UsernamePasswordCredentials;
-import org.apache.commons.httpclient.params.HttpClientParams;
+import org.apache.http.impl.client.BasicCredentialsProvider;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.auth.AuthScope;
+import org.apache.http.auth.UsernamePasswordCredentials;
+import org.apache.http.client.CredentialsProvider;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.mapred.JobConf;
-import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import org.apache.solr.client.solrj.impl.CloudSolrServer;
+import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
+import org.apache.solr.client.solrj.impl.LBHttpSolrServer;
+import org.apache.solr.client.solrj.SolrServer;
 
 import java.net.MalformedURLException;
 
@@ -31,33 +37,62 @@ public class SolrUtils {
 
   public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class);
 
-  public static CommonsHttpSolrServer getCommonsHttpSolrServer(JobConf job)
+  private static SolrServer server;
+
+  public static SolrServer getSolrServer(JobConf job)
       throws MalformedURLException {
-    HttpClient client = new HttpClient();
 
+    boolean auth = job.getBoolean(SolrConstants.USE_AUTH, false);
+
+    CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
     // Check for username/password
-    if (job.getBoolean(SolrConstants.USE_AUTH, false)) {
+    if (auth) {
       String username = job.get(SolrConstants.USERNAME);
-
       LOG.info("Authenticating as: " + username);
-
       AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT,
           AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
-
-      client.getState().setCredentials(
-          scope,
-          new UsernamePasswordCredentials(username, job
-              .get(SolrConstants.PASSWORD)));
-
-      HttpClientParams params = client.getParams();
-      params.setAuthenticationPreemptive(true);
-
-      client.setParams(params);
+      credentialsProvider.setCredentials(scope, 
+          new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD)));
     }
+    CloseableHttpClient client = 
+        HttpClientBuilder.create().setDefaultCredentialsProvider(credentialsProvider).build();
 
-    String serverURL = job.get(SolrConstants.SERVER_URL);
-
-    return new CommonsHttpSolrServer(serverURL, client);
+    String solrServer = job.get(SolrConstants.SERVER_TYPE, "http");
+    String zkHost = job.get(SolrConstants.ZOOKEEPER_URL, null);
+    String solrServerUrl = job.get(SolrConstants.SERVER_URL);
+
+    switch (solrServer) {
+    case "cloud":
+      server = new CloudSolrServer(zkHost);
+      LOG.debug("CloudSolrServer selected as indexing server.");
+      break;
+    case "concurrent":
+      server = new ConcurrentUpdateSolrServer(solrServerUrl, client, 1000, 10);
+      LOG.debug("ConcurrentUpdateSolrServer selected as indexing server.");
+      break;
+    case "http":
+      if (auth) {
+        server = new HttpSolrServer(solrServerUrl, client);
+      } else {
+        server = new HttpSolrServer(solrServerUrl);
+      }
+      LOG.debug("HttpSolrServer selected as indexing server.");
+      break;
+    case "lb":
+      String[] lbServerString = job.get(SolrConstants.LOADBALANCE_URLS).split(",");
+      server = new LBHttpSolrServer(client, lbServerString);
+      LOG.debug("LBHttpSolrServer selected as indexing server.");
+      break;
+    default:
+      if (auth) {
+        server = new HttpSolrServer(solrServerUrl, client);
+      } else {
+        server = new HttpSolrServer(solrServerUrl);
+      }
+      LOG.debug("HttpSolrServer selected as indexing server.");
+      break;
+    }
+    return server;
   }
 
   public static String stripNonCharCodepoints(String input) {
@@ -82,4 +117,4 @@ public class SolrUtils {
 
     return retval.toString();
   }
-}
+}
\ No newline at end of file

Modified: nutch/trunk/src/plugin/parse-tika/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/ivy.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/ivy.xml Tue Aug 18 21:19:07 2015
@@ -38,6 +38,8 @@
   <dependencies>
     <dependency org="org.apache.tika" name="tika-parsers" rev="1.8" conf="*->default">
      <exclude org="org.apache.tika" name="tika-core" />
+     <exclude org="org.apache.httpcomponents" name="httpclient" />
+     <exclude org="org.apache.httpcomponents" name="httpcore" />
     </dependency>
     <override module="rome" rev="0.9"/>
   </dependencies>

Copied: nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml (from r1693468, nutch/trunk/src/plugin/parse-tika/build-ivy.xml)
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml?p2=nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml&p1=nutch/trunk/src/plugin/parse-tika/build-ivy.xml&r1=1693468&r2=1696506&rev=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/build-ivy.xml (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/build-ivy.xml Tue Aug 18 21:19:07 2015
@@ -15,7 +15,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 -->
-<project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+<project name="parsefilter-naivebayes" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
 
     <property name="ivy.install.version" value="2.1.0" />
     <condition property="ivy.home" value="${env.IVY_HOME}">

Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml Tue Aug 18 21:19:07 2015
@@ -36,6 +36,14 @@
   </publications>
 
   <dependencies>
+
+    <dependency org="org.apache.mahout" name="mahout-math" rev="0.10.1" />
+    <dependency org="org.apache.mahout" name="mahout-core" rev="0.9" >
+      <exclude org="org.apache.mrunit" name="mrunit"/>
+    </dependency>
+    <dependency org="org.apache.lucene" name="lucene-core" rev="4.10.2" />
+    <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.10.2" />
+
   </dependencies>
   
 </ivy-module>

Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml Tue Aug 18 21:19:07 2015
@@ -25,10 +25,22 @@
       <library name="parsefilter-naivebayes.jar">
          <export name="*"/>
       </library>
-      <library name="lucene-analyzers-common-4.3.0.jar"/>
-      <library name="mahout-math-0.8.jar"/>
-      <library name="mahout-core-0.8.jar"/>
-      <library name="lucene-core-4.3.0.jar"/>     
+      <library name="commons-cli-2.0-mahout.jar"/>
+      <library name="commons-lang3-3.1.jar"/>
+      <library name="commons-math3-3.2.jar"/>
+      <library name="guava-14.0.1.jar"/>
+      <library name="jackson-core-asl-1.9.12.jar"/>
+      <library name="jackson-mapper-asl-1.9.12.jar"/>
+      <library name="lucene-analyzers-common-4.10.2.jar"/>
+      <library name="lucene-core-4.10.2.jar"/>
+      <library name="mahout-core-0.9.jar"/>
+      <library name="mahout-math-0.10.1.jar"/>
+      <library name="slf4j-api-1.7.12.jar"/>
+      <library name="solr-commons-csv-3.5.0.jar"/>
+      <library name="t-digest-3.1.jar"/>
+      <library name="xmlpull-1.1.3.1.jar"/>
+      <library name="xpp3_min-1.1.4c.jar"/>
+      <library name="xstream-1.4.4.jar"/> 
    </runtime>
 
    <requires>
@@ -36,10 +48,9 @@
    </requires>
 
    <extension id="org.apache.nutch.htmlparsefilter.naivebayes"
-              name="Nutch Parser Filter"
-              point="org.apache.nutch.parse.HtmlParseFilter">
-      <implementation id="NaiveBayesHTMLParseFilter"
-                      class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/>
+        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="NaiveBayesHTMLParseFilter" 
+        class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/>
    </extension>
 
 </plugin>

Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java?rev=1696506&r1=1696505&r2=1696506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java Tue Aug 18 21:19:07 2015
@@ -35,7 +35,6 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.util.Version;
 import org.apache.mahout.classifier.naivebayes.BayesUtils;
 import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
 import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
@@ -47,17 +46,12 @@ import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.Vector.Element;
 import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
 import org.apache.mahout.vectorizer.TFIDF;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import com.google.common.collect.ConcurrentHashMultiset;
 import com.google.common.collect.Multiset;
 
 public class NaiveBayesClassifier {
 
   private static NaiveBayesModel model = null;
-  private static final Logger LOG = LoggerFactory
-      .getLogger(NaiveBayesClassifier.class);
 
   public static Map<String, Integer> readDictionnary(Configuration conf,
       Path dictionnaryPath) {
@@ -114,7 +108,7 @@ public class NaiveBayesClassifier {
 
   public static String classify(String text, String modelPath,
       String labelIndexPath, String dictionaryPath, String documentFrequencyPath)
-      throws IOException {
+          throws IOException {
 
     Configuration configuration = new Configuration();
 
@@ -134,7 +128,7 @@ public class NaiveBayesClassifier {
         new Path(documentFrequencyPath));
 
     // analyzer used to extract word from text
-    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
+    Analyzer analyzer = new StandardAnalyzer();
 
     // int labelCount = labels.size();
     int documentCount = documentFrequency.get(-1).intValue();


Reply via email to