Author: lewismc
Date: Sat Jan 10 23:24:58 2015
New Revision: 1650829

URL: http://svn.apache.org/r1650829
Log:
NUTCH-1660 Index filter for Page's latitude and longitude

Added:
    nutch/trunk/src/plugin/index-geoip/
    nutch/trunk/src/plugin/index-geoip/build.xml
    nutch/trunk/src/plugin/index-geoip/ivy.xml
    nutch/trunk/src/plugin/index-geoip/plugin.xml
    nutch/trunk/src/plugin/index-geoip/src/
    nutch/trunk/src/plugin/index-geoip/src/java/
    nutch/trunk/src/plugin/index-geoip/src/java/org/
    nutch/trunk/src/plugin/index-geoip/src/java/org/apache/
    nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/
    nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/
    
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
    
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
    
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
    nutch/trunk/src/plugin/index-geoip/src/test/
    nutch/trunk/src/plugin/index-geoip/src/test/org/
    nutch/trunk/src/plugin/index-geoip/src/test/org/apache/
    nutch/trunk/src/plugin/index-geoip/src/test/org/apache/nutch/
    nutch/trunk/src/plugin/index-geoip/src/test/org/apache/nutch/indexer/
    nutch/trunk/src/plugin/index-geoip/src/test/org/apache/nutch/indexer/geoip/
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/build.xml
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/conf/schema-solr4.xml
    nutch/trunk/conf/schema.xml
    nutch/trunk/conf/solrindex-mapping.xml
    nutch/trunk/default.properties
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Jan 10 23:24:58 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1660 Index filter for Page's latitude and longitude (Yasin Kılınç, 
lewismc)
+
 * NUTCH-1140 index-more plugin, resetTitle creates multiple values in title 
field (Joe Liedtke, kaveh minooie via snagel)
 
 * NUTCH-1904 Schema for Solr4 doesn't include _version_ field (mattmann)

Modified: nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Sat Jan 10 23:24:58 2015
@@ -176,6 +176,7 @@
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
+      <packageset dir="${plugins.dir}/index-geoip/src/java"/>
       <packageset dir="${plugins.dir}/index-static/src/java"/>
       <packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
@@ -579,6 +580,7 @@
       <packageset dir="${plugins.dir}/headings/src/java"/>
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
+      <packageset dir="${plugins.dir}/index-geoip/src/java"/>
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
       <packageset dir="${plugins.dir}/index-static/src/java"/>
@@ -958,6 +960,8 @@
         <source path="${plugins.dir}/index-anchor/src/test/" />
         <source path="${plugins.dir}/index-basic/src/java/" />
         <source path="${plugins.dir}/index-basic/src/test/" />
+        <source path="${plugins.dir}/index-geoip/src/java/" />
+        <source path="${plugins.dir}/index-geoip/src/test/" />
         <source path="${plugins.dir}/indexer-dummy/src/java/" />
         <source path="${plugins.dir}/indexer-solr/src/java/" />
         <source path="${plugins.dir}/indexer-elastic/src/java/" />

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Sat Jan 10 23:24:58 2015
@@ -1373,6 +1373,36 @@
   </description>
 </property>
 
+<!-- index-geoip plugin properties -->
+<property>
+  <name>index.geoip.usage</name>
+  <value>insightsService</value>
+  <description>
+  A string representing the information source to be used for GeoIP information
+  association. Either enter 'cityDatabase', 'connectionTypeDatabase', 
+  'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any 
one of the 
+  Database options, you should make one of GeoIP2-City.mmdb, 
GeoIP2-Connection-Type.mmdb, 
+  GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the 
classpath and
+  available at runtime.
+  </description>
+</property>
+
+<property>
+  <name>index.geoip.userid</name>
+  <value></value>
+  <description>
+  The userId associated with the GeoIP2 Precision Services account.
+  </description>
+</property>
+
+<property>
+  <name>index.geoip.licensekey</name>
+  <value></value>
+  <description>
+  The license key associated with the GeoIP2 Precision Services account.
+  </description>
+</property>
+
 <!-- parse-metatags plugin properties -->
 <property>
   <name>metatags.names</name>

Modified: nutch/trunk/conf/schema-solr4.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/schema-solr4.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/conf/schema-solr4.xml (original)
+++ nutch/trunk/conf/schema-solr4.xml Sat Jan 10 23:24:58 2015
@@ -79,7 +79,9 @@
          Note: For faster range queries, consider the tdate type
       -->
     <fieldType name="date" class="solr.TrieDateField" omitNorms="true" 
precisionStep="0" positionIncrementGap="0"/>
-
+    
+    <fieldType name="location" class="solr.LatLonType" 
subFieldSuffix="_coordinate"/>
+    
     <!-- A Trie based date field for faster date range queries and date 
faceting. -->
     <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" 
precisionStep="6" positionIncrementGap="0"/>
 
@@ -298,6 +300,25 @@
          any data added to them will be ignored outright.  --> 
     <fieldtype name="ignored" stored="false" indexed="false" 
multiValued="true" class="solr.StrField" />
 
+        <!-- boolean type: "true" or "false" -->
+        <fieldType name="boolean" class="solr.BoolField" 
sortMissingLast="true"/>
+
+         <!-- sortMissingLast and sortMissingFirst are optional attributes that are
+         currently supported on types that are sorted internally as strings
+         and on numeric types.
+         This includes "string","boolean", and, as of 3.5 (and 4.x),
+         int, float, long, date, double, including the "Trie" variants.
+       - If sortMissingLast="true", then a sort on this field will cause 
documents
+         without the field to come after documents with the field,
+         regardless of the requested sort order (asc or desc).
+       - If sortMissingFirst="true", then a sort on this field will cause 
documents
+         without the field to come before documents with the field,
+         regardless of the requested sort order.
+       - If sortMissingLast="false" and sortMissingFirst="false" (the default),
+         then default lucene sorting will be used which places docs without the
+         field first in an ascending sort and last in a descending sort.
+         -->
+
  </types>
 
  <fields>
@@ -318,6 +339,41 @@
     <field name="cache" type="string" stored="true" indexed="false"/>
     <field name="tstamp" type="date" stored="true" indexed="false"/>
 
+    <!-- fields for index-geoip plugin -->
+    <field name="ip" type="string" stored="true" indexed="true" />
+    <field name="cityName" type="string" stored="true" indexed="true" />
+    <field name="cityConfidence" type="int" stored="true" indexed="true" />
+    <field name="cityGeoNameId" type="int" stored="true" indexed="true" />
+    <field name="continentCode" type="string" stored="true" indexed="true" />
+    <field name="continentGeoNameId" type="int" stored="true" indexed="true" />
+    <field name="continentName" type="string" stored="true" indexed="true" />
+    <field name="countryIsoCode" type="string" stored="true" indexed="true"/>
+    <field name="countryName" type="string" stored="true" indexed="true" />
+    <field name="countryConfidence" type="int" stored="true" indexed="true"/>
+    <field name="countryGeoNameId" type="int" stored="true" indexed="true"/>
+    <field name="latLon" type="string" stored="true" indexed="true"/>
+    <field name="accRadius" type="int" stored="true" indexed="true"/>
+    <field name="timeZone" type="string" stored="true" indexed="true"/>
+    <field name="metroCode" type="int" stored="true" indexed="true" />
+    <field name="postalCode" type="string" stored="true" indexed="true" />
+    <field name="postalConfidence" type="int" stored="true" indexed="true" />
+    <field name="countryType" type="string" stored="true" indexed="true" />
+    <field name="subDivName" type="string" stored="true" indexed="true" />
+    <field name="subDivIsoCode" type="string" stored="true" indexed="true" />
+    <field name="subDivConfidence" type="int" stored="true" indexed="true" />
+    <field name="subDivGeoNameId" type="int" stored="true" indexed="true" /> 
+    <field name="autonSystemNum" type="int" stored="true" indexed="true" />
+    <field name="autonSystemOrg" type="string" stored="true" indexed="true" />
+    <field name="domain" type="string" stored="true" indexed="true" />
+    <field name="isp" type="string" stored="true" indexed="true" />
+    <field name="org" type="string" stored="true" indexed="true" />
+    <field name="userType" type="string" stored="true" indexed="true" />
+    <field name="isAnonProxy" type="boolean" stored="true" indexed="true" />
+    <field name="isSatelliteProv" type="boolean" stored="true" indexed="true" 
/>
+    <field name="connType" type="string" stored="true" indexed="true" />
+
+    <dynamicField name="*_coordinate" type="tdouble" indexed="true" 
stored="false"/>
+
     <!-- catch-all field -->
     <field name="text" type="text_general" stored="false" indexed="true" 
multiValued="true"/>
 
@@ -363,5 +419,5 @@
  <copyField source="title" dest="text"/>
  <copyField source="anchor" dest="text"/>
  <copyField source="author" dest="text"/>
-
+ <copyField source="latLon" dest="location"/>
 </schema>

Modified: nutch/trunk/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/conf/schema.xml (original)
+++ nutch/trunk/conf/schema.xml Sat Jan 10 23:24:58 2015
@@ -38,6 +38,7 @@
             omitNorms="true" positionIncrementGap="0"/>
         <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
             omitNorms="true" positionIncrementGap="0"/>
+        <fieldType name="location" class="solr.LatLonType" 
subFieldSuffix="_coordinate"/>
 
         <fieldType name="text" class="solr.TextField"
             positionIncrementGap="100">
@@ -64,6 +65,24 @@
                     generateWordParts="1" generateNumberParts="1"/>
             </analyzer>
         </fieldType>
+            <!-- boolean type: "true" or "false" -->
+        <fieldType name="boolean" class="solr.BoolField" 
sortMissingLast="true"/>
+
+         <!-- sortMissingLast and sortMissingFirst are optional attributes that are
+         currently supported on types that are sorted internally as strings
+         and on numeric types.
+         This includes "string","boolean", and, as of 3.5 (and 4.x),
+         int, float, long, date, double, including the "Trie" variants.
+       - If sortMissingLast="true", then a sort on this field will cause 
documents
+         without the field to come after documents with the field,
+         regardless of the requested sort order (asc or desc).
+       - If sortMissingFirst="true", then a sort on this field will cause 
documents
+         without the field to come before documents with the field,
+         regardless of the requested sort order.
+       - If sortMissingLast="false" and sortMissingFirst="false" (the default),
+         then default lucene sorting will be used which places docs without the
+         field first in an ascending sort and last in a descending sort.
+         -->  
     </types>
     <fields>
         <field name="id" type="string" stored="true" indexed="true"
@@ -83,6 +102,43 @@
         <field name="title" type="text" stored="true" indexed="true"/>
         <field name="cache" type="string" stored="true" indexed="false"/>
         <field name="tstamp" type="date" stored="true" indexed="false"/>
+        
+        <!-- fields for index-geoip plugin -->
+        <field name="ip" type="string" stored="true" indexed="true" />
+        <field name="cityName" type="string" stored="true" indexed="true" />
+        <field name="cityConfidence" type="int" stored="true" indexed="true" />
+        <field name="cityGeoNameId" type="int" stored="true" indexed="true" />
+        <field name="continentCode" type="string" stored="true" indexed="true" 
/>
+        <field name="continentGeoNameId" type="int" stored="true" 
indexed="true" />
+        <field name="continentName" type="string" stored="true" indexed="true" />
+        <field name="countryIsoCode" type="string" stored="true" 
indexed="true"/>
+        <field name="countryName" type="string" stored="true" indexed="true" />
+        <field name="countryConfidence" type="int" stored="true" 
indexed="true"/>
+        <field name="countryGeoNameId" type="int" stored="true" 
indexed="true"/>        
+        <field name="latLon" type="string" stored="true" indexed="true"/>
+        <field name="accRadius" type="int" stored="true" indexed="true"/>
+        <field name="timeZone" type="string" stored="true" indexed="true"/>
+        <field name="metroCode" type="int" stored="true" indexed="true" />
+        <field name="postalCode" type="string" stored="true" indexed="true" />
+        <field name="postalConfidence" type="int" stored="true" indexed="true" 
/>
+        <field name="countryType" type="string" stored="true" indexed="true" />
+        <field name="subDivName" type="string" stored="true" indexed="true" />
+        <field name="subDivIsoCode" type="string" stored="true" indexed="true" 
/>
+        <field name="subDivConfidence" type="int" stored="true" indexed="true" 
/>
+        <field name="subDivGeoNameId" type="int" stored="true" indexed="true" 
/>
+        <field name="autonSystemNum" type="int" stored="true" indexed="true" />
+        <field name="autonSystemOrg" type="string" stored="true" 
indexed="true" />
+        <field name="domain" type="string" stored="true" indexed="true" />
+        <field name="isp" type="string" stored="true" indexed="true" />
+        <field name="org" type="string" stored="true" indexed="true" />
+        <field name="userType" type="string" stored="true" indexed="true" />
+        <field name="isAnonProxy" type="boolean" stored="true" indexed="true" 
/>
+        <field name="isSatelliteProv" type="boolean" stored="true" 
indexed="true" />
+        <field name="connType" type="string" stored="true" indexed="true" />
+        
+
+        
+        <dynamicField name="*_coordinate" type="tdouble" indexed="true" 
stored="false"/>
 
         <!-- fields for index-anchor plugin -->
         <field name="anchor" type="string" stored="true" indexed="true"
@@ -137,5 +193,6 @@
     <copyField source="title" dest="text"/>
     <copyField source="anchor" dest="text"/>
     <copyField source="author" dest="text"/>
+    <copyField source="latLon" dest="location"/>
 
 </schema>

Modified: nutch/trunk/conf/solrindex-mapping.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/solrindex-mapping.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/conf/solrindex-mapping.xml (original)
+++ nutch/trunk/conf/solrindex-mapping.xml Sat Jan 10 23:24:58 2015
@@ -17,8 +17,8 @@
 -->
 
 <mapping>
-       <!-- Simple mapping of fields created by Nutch IndexingFilters
-            to fields defined (and expected) in Solr schema.xml.
+  <!-- Simple mapping of fields created by Nutch IndexingFilters
+       to fields defined (and expected) in Solr schema.xml.
 
              Any fields in NutchDocument that match a name defined
              in field/@source will be renamed to the corresponding
@@ -30,14 +30,14 @@
              uniqueKey has the same meaning as in Solr schema.xml
              and defaults to "id" if not defined.
          -->
-       <fields>
-               <field dest="content" source="content"/>
-               <field dest="title" source="title"/>
-               <field dest="host" source="host"/>
-               <field dest="segment" source="segment"/>
-               <field dest="boost" source="boost"/>
-               <field dest="digest" source="digest"/>
-               <field dest="tstamp" source="tstamp"/>
-       </fields>
-       <uniqueKey>id</uniqueKey>
+  <fields>
+    <field dest="content" source="content"/>
+    <field dest="title" source="title"/>
+    <field dest="host" source="host"/>
+    <field dest="segment" source="segment"/>
+    <field dest="boost" source="boost"/>
+    <field dest="digest" source="digest"/>
+    <field dest="tstamp" source="tstamp"/>
+  </fields>
+  <uniqueKey>id</uniqueKey>
 </mapping>

Modified: nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/default.properties (original)
+++ nutch/trunk/default.properties Sat Jan 10 23:24:58 2015
@@ -148,6 +148,7 @@ plugins.index=\
    org.apache.nutch.indexer.anchor*:\
    org.apache.nutch.indexer.basic*:\
    org.apache.nutch.indexer.feed*:\
+   org.apache.nutch.indexer.geoip*:\
    org.apache.nutch.indexer.metadata*:\
    org.apache.nutch.indexer.more*:\
    org.apache.nutch.indexer.static*:\

Modified: 
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java 
Sat Jan 10 23:24:58 2015
@@ -17,7 +17,6 @@
  
 package org.apache.nutch.indexer;
 
-import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 

Modified: nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1650829&r1=1650828&r2=1650829&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Sat Jan 10 23:24:58 2015
@@ -31,6 +31,7 @@
      <ant dir="headings" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-anchor" target="deploy"/>
+     <ant dir="index-geoip" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
      <ant dir="index-static" target="deploy"/>
      <ant dir="index-metadata" target="deploy"/>
@@ -83,6 +84,7 @@
      <ant dir="creativecommons" target="test"/>
      <ant dir="index-basic" target="test"/>
      <ant dir="index-anchor" target="test"/>
+     <ant dir="index-geoip" target="test"/>
      <ant dir="index-more" target="test"/>
      <ant dir="index-static" target="test"/>
      <ant dir="language-identifier" target="test"/>
@@ -122,6 +124,7 @@
     <ant dir="headings" target="clean"/>
     <ant dir="index-basic" target="clean"/>
     <ant dir="index-anchor" target="clean"/>
+     <ant dir="index-geoip" target="clean"/>
     <ant dir="index-more" target="clean"/>
     <ant dir="index-static" target="clean"/>
     <ant dir="index-metadata" target="clean"/>

Added: nutch/trunk/src/plugin/index-geoip/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/build.xml?rev=1650829&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/build.xml (added)
+++ nutch/trunk/src/plugin/index-geoip/build.xml Sat Jan 10 23:24:58 2015
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-geoip" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+  <target name="init-plugin">
+    <echo>Copying MaxMind GeoIP .mmdb files to build</echo>
+    <copy todir="${build.classes}">
+      <fileset dir="${src.dir}" includes="**/*.mmdb" />
+    </copy>
+  </target>
+</project>

Added: nutch/trunk/src/plugin/index-geoip/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/ivy.xml?rev=1650829&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/ivy.xml (added)
+++ nutch/trunk/src/plugin/index-geoip/ivy.xml Sat Jan 10 23:24:58 2015
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.1.0" />
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/index-geoip/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/plugin.xml?rev=1650829&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/plugin.xml (added)
+++ nutch/trunk/src/plugin/index-geoip/plugin.xml Sat Jan 10 23:24:58 2015
@@ -0,0 +1,43 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-geoip"
+   name="GeoIP2 Indexing Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="index-geoip.jar">
+         <export name="*"/>
+      </library>
+      <library name="geoip2-2.1.0.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.geoip"
+              name="Nutch GeoIP2 Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="GeoIPIndexingFilter"
+                      
class="org.apache.nutch.indexer.geoip.GeoIPIndexingFilter"/>
+   </extension>
+
+</plugin>
+

Added: 
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java?rev=1650829&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
 (added)
+++ 
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
 Sat Jan 10 23:24:58 2015
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.geoip;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+import org.apache.nutch.indexer.NutchDocument;
+
+import com.maxmind.geoip2.DatabaseReader;
+import com.maxmind.geoip2.WebServiceClient;
+import com.maxmind.geoip2.exception.GeoIp2Exception;
+import com.maxmind.geoip2.model.InsightsResponse;
+import com.maxmind.geoip2.model.CityResponse;
+import com.maxmind.geoip2.model.ConnectionTypeResponse;
+import com.maxmind.geoip2.model.CountryResponse;
+import com.maxmind.geoip2.model.DomainResponse;
+import com.maxmind.geoip2.model.IspResponse;
+import com.maxmind.geoip2.record.City;
+import com.maxmind.geoip2.record.Continent;
+import com.maxmind.geoip2.record.Country;
+import com.maxmind.geoip2.record.Location;
+import com.maxmind.geoip2.record.Postal;
+import com.maxmind.geoip2.record.RepresentedCountry;
+import com.maxmind.geoip2.record.Subdivision;
+import com.maxmind.geoip2.record.Traits;
+
+/**
+ * <p>Simple utility class which enables efficient, structured
+ * {@link org.apache.nutch.indexer.NutchDocument} building based on input 
+ * from {@link GeoIPIndexingFilter}, where configuration is also read.</p>
+ * <p>Based on the nature of the input, this class wraps factory type
+ * implementations for populating {@link 
org.apache.nutch.indexer.NutchDocument}'s
+ * with the correct {@link org.apache.nutch.indexer.NutchField} information.
+ *
+ */
+public class GeoIPDocumentCreator {
+
+  /**
+   * Default constructor. Performs no initialisation: the body is empty and
+   * the document-building methods visible in this class are all static, so
+   * they can be called without creating an instance.
+   */
+  public GeoIPDocumentCreator() {
+  }
+
+  /**
+   * Populates the given {@link NutchDocument} with the city, continent,
+   * country, location, postal, represented-country, subdivision and traits
+   * records returned by a GeoIP2 Insights lookup for a server address.
+   *
+   * <p>The raw address is added under {@code ip} before the lookup is
+   * issued, so it is already on the document if the lookup throws. The
+   * {@code countryGeoNameId} and {@code subDivIsoCode} fields use the names
+   * declared for the index-geoip plugin in the Solr schema files.</p>
+   *
+   * @param serverIp the IP address (or host name) to look up
+   * @param doc the document to which the GeoIP fields are added
+   * @param client the web service client used to issue the Insights request
+   * @return the same {@code doc} instance, with the GeoIP fields added
+   * @throws UnknownHostException if {@code serverIp} cannot be resolved
+   * @throws IOException if an I/O error occurs during the lookup
+   * @throws GeoIp2Exception if the lookup itself fails
+   */
+  public static NutchDocument createDocFromInsightsService(String serverIp,
+      NutchDocument doc, WebServiceClient client)
+      throws UnknownHostException, IOException, GeoIp2Exception {
+    doc.add("ip", serverIp);
+    InsightsResponse response =
+        client.insights(InetAddress.getByName(serverIp));
+
+    City city = response.getCity();
+    doc.add("cityName", city.getName());             // 'Minneapolis'
+    doc.add("cityConfidence", city.getConfidence()); // 50
+    doc.add("cityGeoNameId", city.getGeoNameId());
+
+    Continent continent = response.getContinent();
+    doc.add("continentCode", continent.getCode());
+    doc.add("continentGeoNameId", continent.getGeoNameId());
+    doc.add("continentName", continent.getName());
+
+    Country country = response.getCountry();
+    doc.add("countryIsoCode", country.getIsoCode());       // 'US'
+    doc.add("countryName", country.getName());             // 'United States'
+    doc.add("countryConfidence", country.getConfidence()); // 99
+    // Was "countryGeoName", which has no matching schema field.
+    doc.add("countryGeoNameId", country.getGeoNameId());
+
+    Location location = response.getLocation();
+    // "lat,lon" string form, e.g. 44.9733,-93.2323
+    doc.add("latLon",
+        location.getLatitude() + "," + location.getLongitude());
+    doc.add("accRadius", location.getAccuracyRadius()); // 3
+    doc.add("timeZone", location.getTimeZone());        // 'America/Chicago'
+    doc.add("metroCode", location.getMetroCode());
+
+    Postal postal = response.getPostal();
+    doc.add("postalCode", postal.getCode());             // '55455'
+    doc.add("postalConfidence", postal.getConfidence()); // 40
+
+    RepresentedCountry rCountry = response.getRepresentedCountry();
+    doc.add("countryType", rCountry.getType());
+
+    Subdivision subdivision = response.getMostSpecificSubdivision();
+    doc.add("subDivName", subdivision.getName());             // 'Minnesota'
+    // Was "subDivIdoCode", a typo for the schema field "subDivIsoCode".
+    doc.add("subDivIsoCode", subdivision.getIsoCode());       // 'MN'
+    doc.add("subDivConfidence", subdivision.getConfidence()); // 90
+    doc.add("subDivGeoNameId", subdivision.getGeoNameId());
+
+    Traits traits = response.getTraits();
+    doc.add("autonSystemNum", traits.getAutonomousSystemNumber());
+    doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization());
+    doc.add("domain", traits.getDomain());
+    doc.add("isp", traits.getIsp());
+    doc.add("org", traits.getOrganization());
+    doc.add("userType", traits.getUserType());
+    doc.add("isAnonProxy", traits.isAnonymousProxy());
+    doc.add("isSatelliteProv", traits.isSatelliteProvider());
+    return doc;
+  }
+
+  /**
+   * Issues a city lookup for the given address through the web service
+   * client. The response is currently discarded and no fields are added, so
+   * the document is returned as it was passed in.
+   *
+   * @param serverIp the IP address (or host name) to look up
+   * @param doc the document to return
+   * @param client the web service client used to issue the city request
+   * @return the same {@code doc} instance, unmodified
+   * @throws UnknownHostException if {@code serverIp} cannot be resolved
+   * @throws IOException if an I/O error occurs during the lookup
+   * @throws GeoIp2Exception if the lookup itself fails
+   */
+  @SuppressWarnings("unused")
+  public static NutchDocument createDocFromCityService(String serverIp,
+      NutchDocument doc, WebServiceClient client)
+      throws UnknownHostException, IOException, GeoIp2Exception {
+    InetAddress address = InetAddress.getByName(serverIp);
+    CityResponse cityLookup = client.city(address);
+    return doc;
+  }
+
+  /**
+   * Performs a country lookup for the given server IP against the web service.
+   * <p>The response is requested but not yet mapped onto any document field,
+   * so the document is handed back without additions.</p>
+   *
+   * @param serverIp address passed to {@link InetAddress#getByName(String)}
+   * @param doc document that is returned to the caller
+   * @param client web service client used for the country lookup
+   * @return the {@code doc} instance that was passed in
+   * @throws UnknownHostException if {@code serverIp} cannot be resolved
+   * @throws IOException propagated from the calls made here
+   * @throws GeoIp2Exception propagated from the calls made here
+   */
+  @SuppressWarnings("unused")
+  public static NutchDocument createDocFromCountryService(String serverIp,
+      NutchDocument doc, WebServiceClient client)
+      throws UnknownHostException, IOException, GeoIp2Exception {
+    InetAddress address = InetAddress.getByName(serverIp);
+    // The lookup result is intentionally left unused for now.
+    CountryResponse countryLookup = client.country(address);
+    return doc;
+  }
+
+  /**
+   * Adds ISP details for the given server IP to the document.
+   * <p>Looks the address up with the ISP database and adds the fields
+   * {@code ip}, {@code autonSystemNum}, {@code autonSystemOrg}, {@code isp}
+   * and {@code org}.</p>
+   *
+   * @param serverIp address passed to {@link InetAddress#getByName(String)}
+   * @param doc document the fields are added to
+   * @param reader database reader used for the ISP lookup
+   * @return the {@code doc} instance that was passed in
+   * @throws UnknownHostException if {@code serverIp} cannot be resolved
+   * @throws IOException propagated from the calls made here
+   * @throws GeoIp2Exception propagated from the calls made here
+   */
+  public static NutchDocument createDocFromIspDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader)
+      throws UnknownHostException, IOException, GeoIp2Exception {
+    InetAddress address = InetAddress.getByName(serverIp);
+    IspResponse ispInfo = reader.isp(address);
+    // Fields are only added once the lookup has succeeded.
+    doc.add("ip", serverIp);
+    doc.add("autonSystemNum", ispInfo.getAutonomousSystemNumber());
+    doc.add("autonSystemOrg", ispInfo.getAutonomousSystemOrganization());
+    doc.add("isp", ispInfo.getIsp());
+    doc.add("org", ispInfo.getOrganization());
+    return doc;
+  }
+
+  /**
+   * Adds the domain associated with the given server IP to the document.
+   * <p>Looks the address up with the domain database and adds the fields
+   * {@code ip} and {@code domain}.</p>
+   *
+   * @param serverIp address passed to {@link InetAddress#getByName(String)}
+   * @param doc document the fields are added to
+   * @param reader database reader used for the domain lookup
+   * @return the {@code doc} instance that was passed in
+   * @throws UnknownHostException if {@code serverIp} cannot be resolved
+   * @throws IOException propagated from the calls made here
+   * @throws GeoIp2Exception propagated from the calls made here
+   */
+  public static NutchDocument createDocFromDomainDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader)
+      throws UnknownHostException, IOException, GeoIp2Exception {
+    InetAddress address = InetAddress.getByName(serverIp);
+    DomainResponse domainInfo = reader.domain(address);
+    // Fields are only added once the lookup has succeeded.
+    doc.add("ip", serverIp);
+    doc.add("domain", domainInfo.getDomain());
+    return doc;
+  }
+
+  /**
+   * Adds the connection type of the given server IP to the document.
+   * <p>Looks the address up with the connection-type database and adds the
+   * field {@code ip}, plus {@code connType} when the response carries a
+   * connection type.</p>
+   *
+   * @param serverIp address passed to {@link InetAddress#getByName(String)}
+   * @param doc document the fields are added to
+   * @param reader database reader used for the connection-type lookup
+   * @return the {@code doc} instance that was passed in
+   * @throws UnknownHostException if {@code serverIp} cannot be resolved
+   * @throws IOException propagated from the calls made here
+   * @throws GeoIp2Exception propagated from the calls made here
+   */
+  public static NutchDocument createDocFromConnectionDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader)
+      throws UnknownHostException, IOException, GeoIp2Exception {
+    ConnectionTypeResponse response =
+        reader.connectionType(InetAddress.getByName(serverIp));
+    doc.add("ip", serverIp);
+    // NOTE(review): getConnectionType() is assumed to return null when the
+    // record has no connection type - confirm against the GeoIP2 API docs.
+    // Calling toString() on it unguarded would throw a NullPointerException
+    // after the "ip" field had already been added.
+    Object connectionType = response.getConnectionType();
+    if (connectionType != null) {
+      doc.add("connType", connectionType.toString());
+    }
+    return doc;
+  }
+
+  /**
+   * Adds city-level GeoIP details for the given server IP to the document.
+   * <p>The {@code ip} field is added first, then the address is looked up in
+   * the city database and the city, continent, country, location, postal,
+   * represented-country and subdivision values are added as fields.</p>
+   *
+   * @param serverIp address passed to {@link InetAddress#getByName(String)}
+   * @param doc document the fields are added to
+   * @param reader database reader used for the city lookup
+   * @return the {@code doc} instance that was passed in
+   * @throws UnknownHostException if {@code serverIp} cannot be resolved
+   * @throws IOException propagated from the calls made here
+   * @throws GeoIp2Exception propagated from the calls made here
+   */
+  public static NutchDocument createDocFromCityDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader)
+      throws UnknownHostException, IOException, GeoIp2Exception {
+    // The IP is recorded before the lookup, so it is present even if the
+    // lookup below throws.
+    doc.add("ip", serverIp);
+    InetAddress address = InetAddress.getByName(serverIp);
+    CityResponse lookup = reader.city(address);
+
+    // City, e.g. 'Minneapolis' with confidence 50
+    City cityRecord = lookup.getCity();
+    doc.add("cityName", cityRecord.getName());
+    doc.add("cityConfidence", cityRecord.getConfidence());
+    doc.add("cityGeoNameId", cityRecord.getGeoNameId());
+
+    // Continent
+    Continent continentRecord = lookup.getContinent();
+    doc.add("continentCode", continentRecord.getCode());
+    doc.add("continentGeoNameId", continentRecord.getGeoNameId());
+    doc.add("continentName", continentRecord.getName());
+
+    // Country, e.g. 'US' / 'United States' with confidence 99
+    Country countryRecord = lookup.getCountry();
+    doc.add("countryIsoCode", countryRecord.getIsoCode());
+    doc.add("countryName", countryRecord.getName());
+    doc.add("countryConfidence", countryRecord.getConfidence());
+    doc.add("countryGeoName", countryRecord.getGeoNameId());
+
+    // Location, e.g. latLon '44.9733,-93.2323', radius 3, 'America/Chicago'
+    Location geoPoint = lookup.getLocation();
+    String latLon = geoPoint.getLatitude() + "," + geoPoint.getLongitude();
+    doc.add("latLon", latLon);
+    doc.add("accRadius", geoPoint.getAccuracyRadius());
+    doc.add("timeZone", geoPoint.getTimeZone());
+    doc.add("metroCode", geoPoint.getMetroCode());
+
+    // Postal, e.g. '55455' with confidence 40
+    Postal postalRecord = lookup.getPostal();
+    doc.add("postalCode", postalRecord.getCode());
+    doc.add("postalConfidence", postalRecord.getConfidence());
+
+    // Represented country
+    RepresentedCountry represented = lookup.getRepresentedCountry();
+    doc.add("countryType", represented.getType());
+
+    // Most specific subdivision, e.g. 'Minnesota' / 'MN' with confidence 90
+    Subdivision region = lookup.getMostSpecificSubdivision();
+    doc.add("subDivName", region.getName());
+    // NOTE(review): "subDivIdoCode" looks like a typo for "subDivIsoCode";
+    // left unchanged because the index schema may rely on this exact key.
+    doc.add("subDivIdoCode", region.getIsoCode());
+    doc.add("subDivConfidence", region.getConfidence());
+    doc.add("subDivGeoNameId", region.getGeoNameId());
+    return doc;
+  }
+
+}

Added: 
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java?rev=1650829&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
 (added)
+++ 
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
 Sat Jan 10 23:24:58 2015
@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.geoip;
+
+import java.io.File;
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.maxmind.geoip2.DatabaseReader;
+import com.maxmind.geoip2.WebServiceClient;
+
+/**
+ * <p>This plugin implements an indexing filter which takes
+ * advantage of the
+ * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p>
+ * <p>The third party library distribution provides an API for the GeoIP2
+ * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web services</a>
+ * and <a href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>.
+ * The API also works with the free
+ * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.</p>
+ * <p>Depending on the service level agreement you have with the GeoIP
+ * service provider, the plugin can add a number of the following fields
+ * to the index data model:
+ * <ol>
+ * <li>Continent</li>
+ * <li>Country</li>
+ * <li>Regional Subdivision</li>
+ * <li>City</li>
+ * <li>Postal Code</li>
+ * <li>Latitude/Longitude</li>
+ * <li>ISP/Organization</li>
+ * <li>AS Number</li>
+ * <li>Confidence Factors</li>
+ * <li>Radius</li>
+ * <li>User Type</li>
+ * </ol></p>
+ * 
+ * <p>Some of the services are documented at the
+ * <a href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision Services</a>
+ * webpage where more information can be obtained.</p>
+ * 
+ * <p>You should also consult the following three properties in 
<code>nutch-site.xml</code></p>
+ * <pre>
+ * {@code
+ *<!-- index-geoip plugin properties -->
+<property>
+  <name>index.geoip.usage</name>
+  <value>insightsService</value>
+  <description>
+  A string representing the information source to be used for GeoIP information
+  association. Either enter 'cityDatabase', 'connectionTypeDatabase', 
+  'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any 
one of the 
+  Database options, you should make one of GeoIP2-City.mmdb, 
GeoIP2-Connection-Type.mmdb, 
+  GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the 
Hadoop classpath 
+  and available at runtime. This can be achieved by adding it to 
$NUTCH_HOME/conf
+  </description>
+</property>
+
+<property>
+  <name>index.geoip.userid</name>
+  <value></value>
+  <description>
+  The userId associated with the GeoIP2 Precision Services account.
+  </description>
+</property>
+
+<property>
+  <name>index.geoip.licensekey</name>
+  <value></value>
+  <description>
+  The license key associated with the GeoIP2 Precision Services account.
+  </description>
+</property>
+}
+ * </pre>
+ * 
+ */
+public class GeoIPIndexingFilter implements IndexingFilter {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(GeoIPIndexingFilter.class);
+
+  private Configuration conf;
+
+  /** Value of <code>index.geoip.usage</code> as read in {@link #setConf}. */
+  private String usage = null;
+
+  /** Database file resolved from the configuration's resources, if any. */
+  private File geoDb = null;
+
+  /** Web service client; only built for the 'insightsService' usage. */
+  WebServiceClient client = null;
+
+  /** Database reader; only built for the '*Database' usages. */
+  DatabaseReader reader = null;
+
+  /**
+   * Default constructor for this plugin
+   */
+  public GeoIPIndexingFilter() {
+  }
+
+  /**
+   * @see org.apache.hadoop.conf.Configurable#getConf()
+   */
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Stores the configuration and prepares the GeoIP information source
+   * selected by <code>index.geoip.usage</code> (default 'insightsService').
+   * <p>For the database usages the matching <code>.mmdb</code> resource is
+   * looked up through the configuration and opened; failures are logged and
+   * leave {@link #reader} unset. For 'insightsService' a web service client
+   * is built from <code>index.geoip.userid</code> and
+   * <code>index.geoip.licensekey</code>.</p>
+   *
+   * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String use = conf.get("index.geoip.usage", "insightsService");
+    LOG.debug("GeoIP usage medium set to: {}", use);
+    if (use.equalsIgnoreCase("cityDatabase")) {
+      loadDb("GeoIP2-City.mmdb");
+    } else if (use.equalsIgnoreCase("connectionTypeDatabase")) {
+      loadDb("GeoIP2-Connection-Type.mmdb");
+    } else if (use.equalsIgnoreCase("domainDatabase")) {
+      loadDb("GeoIP2-Domain.mmdb");
+    } else if (use.equalsIgnoreCase("ispDatabase")) {
+      loadDb("GeoIP2-ISP.mmdb");
+    } else if (use.equalsIgnoreCase("insightsService")) {
+      client = new WebServiceClient.Builder(
+          conf.getInt("index.geoip.userid", 12345),
+          conf.get("index.geoip.licensekey")).build();
+    } else {
+      // Nothing is configured in this case; filter() will pass documents
+      // through unchanged.
+      LOG.warn("Unrecognised value for index.geoip.usage: {}", use);
+    }
+    usage = use;
+  }
+
+  /**
+   * Resolves the named database resource through the configuration and opens
+   * it. Any failure is logged rather than thrown so that a missing or broken
+   * database does not abort plugin initialisation.
+   *
+   * @param resourceName file name of the <code>.mmdb</code> resource
+   */
+  private void loadDb(String resourceName) {
+    try {
+      java.net.URL resource = conf.getResource(resourceName);
+      if (resource == null) {
+        // Report the actual problem instead of a bare NullPointerException.
+        LOG.error("GeoIP database {} was not found among the configuration "
+            + "resources", resourceName);
+        return;
+      }
+      geoDb = new File(resource.getFile());
+      buildDb();
+    } catch (Exception e) {
+      // Deliberately broad: initialisation is best effort, as before.
+      LOG.error("Unable to load GeoIP database " + resourceName, e);
+    }
+  }
+
+  /**
+   * Opens {@link #geoDb} with a {@link DatabaseReader}; an
+   * {@link IOException} is logged and leaves {@link #reader} unchanged.
+   */
+  private void buildDb() {
+    try {
+      reader = new DatabaseReader.Builder(geoDb).build();
+    } catch (IOException e) {
+      LOG.error("Unable to open GeoIP database " + geoDb, e);
+    }
+  }
+
+  /**
+   * Adds GeoIP fields for the server that delivered the page.
+   *
+   * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument,
+   *      org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text,
+   *      org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks)
+   */
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    return addServerGeo(doc, parse.getData(), url.toString());
+  }
+
+  /**
+   * Looks up the server IP stored under the <code>_ip_</code> content
+   * metadata key and delegates to the {@link GeoIPDocumentCreator} method
+   * matching the configured usage. Nothing is added unless
+   * <code>store.ip.address</code> is true and an IP is present. Lookup
+   * failures are logged and the document is returned as it is at that point.
+   *
+   * @param doc document to enrich
+   * @param data parse data whose content metadata holds the server IP
+   * @param url URL of the page, used for log messages only
+   * @return the document, enriched when a lookup succeeded
+   */
+  private NutchDocument addServerGeo(NutchDocument doc, ParseData data,
+      String url) {
+    if (!conf.getBoolean("store.ip.address", false)) {
+      return doc;
+    }
+    try {
+      String serverIp = data.getContentMeta().get("_ip_");
+      if (serverIp == null) {
+        return doc;
+      }
+      if (usage.equalsIgnoreCase("insightsService")) {
+        return GeoIPDocumentCreator.createDocFromInsightsService(serverIp,
+            doc, client);
+      }
+      if (reader == null) {
+        // The database could not be loaded in setConf (already logged there)
+        // or the usage value is unknown; skip instead of failing per document.
+        LOG.debug("No GeoIP database reader available, skipping {}", url);
+        return doc;
+      }
+      if (usage.equalsIgnoreCase("cityDatabase")) {
+        doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc, reader);
+      } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
+        doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc,
+            reader);
+      } else if (usage.equalsIgnoreCase("domainDatabase")) {
+        doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
+            reader);
+      } else if (usage.equalsIgnoreCase("ispDatabase")) {
+        doc = GeoIPDocumentCreator.createDocFromIspDb(serverIp, doc, reader);
+      }
+    } catch (Exception e) {
+      // Best effort: a failed lookup must not stop indexing of the page.
+      LOG.error("Unable to add GeoIP information for " + url, e);
+    }
+    return doc;
+  }
+
+}

Added: 
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java?rev=1650829&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
 (added)
+++ 
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
 Sat Jan 10 23:24:58 2015
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * <p>This plugin implements an indexing filter which takes
+ * advantage of the
+ * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p>
+ * <p>The third party library distribution provides an API for the GeoIP2
+ * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web services</a>
+ * and <a href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>.
+ * The API also works with the free
+ * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.</p>
+ *
+ */
+package org.apache.nutch.indexer.geoip;
\ No newline at end of file


Reply via email to