Author: jorgelbg
Date: Thu Oct 22 15:26:34 2015
New Revision: 1710033

URL: http://svn.apache.org/viewvc?rev=1710033&view=rev
Log:
NUTCH-2139 Basic plugin to index inlinks and outlinks


Added:
    nutch/trunk/src/plugin/index-links/
    nutch/trunk/src/plugin/index-links/build.xml
    nutch/trunk/src/plugin/index-links/ivy.xml
    nutch/trunk/src/plugin/index-links/plugin.xml
    nutch/trunk/src/plugin/index-links/src/
    nutch/trunk/src/plugin/index-links/src/java/
    nutch/trunk/src/plugin/index-links/src/java/org/
    nutch/trunk/src/plugin/index-links/src/java/org/apache/
    nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/
    nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/
    
nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
    nutch/trunk/src/plugin/index-links/src/test/
    nutch/trunk/src/plugin/index-links/src/test/org/
    nutch/trunk/src/plugin/index-links/src/test/org/apache/
    nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/
    nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/
    nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/
    
nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/build.xml
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1710033&r1=1710032&r2=1710033&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Oct 22 15:26:34 2015
@@ -2,6 +2,8 @@ Nutch Change Log
     
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2139 Basic plugin to index inlinks and outlinks (jorgelbg)
+
 * NUTCH-2128 Review and update mapred --> mapreduce config params in crawl 
script (lewismc)
 
 * NUTCH-2141 Change the InteractiveSelenium plugin handler Interface to return 
page content

Modified: nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1710033&r1=1710032&r2=1710033&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Thu Oct 22 15:26:34 2015
@@ -178,6 +178,7 @@
       <packageset dir="${plugins.dir}/index-geoip/src/java"/>
       <packageset dir="${plugins.dir}/index-replace/src/java"/>
       <packageset dir="${plugins.dir}/index-static/src/java"/>
+      <packageset dir="${plugins.dir}/index-links/src/java"/>
       <packageset dir="${plugins.dir}/mimetype-filter/src/java"/>
       <packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
@@ -589,6 +590,7 @@
       <packageset dir="${plugins.dir}/index-more/src/java"/>
       <packageset dir="${plugins.dir}/index-replace/src/java"/>
       <packageset dir="${plugins.dir}/index-static/src/java"/>
+      <packageset dir="${plugins.dir}/index-links/src/java"/>
       <packageset dir="${plugins.dir}/mimetype-filter/src/java"/>
       <packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
@@ -979,6 +981,8 @@
         <source path="${plugins.dir}/index-basic/src/java/" />
         <source path="${plugins.dir}/index-basic/src/test/" />
         <source path="${plugins.dir}/index-geoip/src/java/" />
+        <source path="${plugins.dir}/index-links/src/java/" />
+        <source path="${plugins.dir}/index-links/src/test/" />
         <source path="${plugins.dir}/mimetype-filter/src/java/" />
         <source path="${plugins.dir}/mimetype-filter/src/test/" />
         <source path="${plugins.dir}/indexer-dummy/src/java/" />

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1710033&r1=1710032&r2=1710033&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Thu Oct 22 15:26:34 2015
@@ -1896,4 +1896,37 @@ CAUTION: Set the parser.timeout to -1 or
   </description>
 </property>
 
+<!-- index-links plugin -->
+
+<property>
+  <name>index.links.outlinks.host.ignore</name>
+  <value>false</value>
+  <description>
+    Ignore outlinks that point to the same host as the URL being indexed.
+    By default all outlinks are indexed. If db.ignore.internal.links is true (default
+    value), this setting does nothing since the internal links are already
+    ignored.
+  </description>
+</property>
+
+<property>
+  <name>index.links.inlinks.host.ignore</name>
+  <value>false</value>
+  <description>
+    Ignore inlinks coming from the same host as the URL being indexed. By default
+    all inlinks are indexed. If db.ignore.internal.links is true (default
+    value), this setting does nothing since the internal links are already
+    ignored.
+  </description>
+</property>
+
+<property>
+  <name>index.links.hosts.only</name>
+  <value>false</value>
+  <description>
+    Forces the index-links plugin to index only the host portion of the inlinks
+    and outlinks.
+  </description>
+</property>
+
 </configuration>

Modified: nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1710033&r1=1710032&r2=1710033&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Thu Oct 22 15:26:34 2015
@@ -36,6 +36,7 @@
      <ant dir="index-replace" target="deploy"/>
      <ant dir="index-static" target="deploy"/>
      <ant dir="index-metadata" target="deploy"/>
+     <ant dir="index-links" target="deploy"/>
      <ant dir="mimetype-filter" target="deploy"/>
      <ant dir="indexer-cloudsearch" target="deploy"/>
      <ant dir="indexer-dummy" target="deploy"/>
@@ -98,6 +99,7 @@
      <ant dir="index-more" target="test"/>
      <ant dir="index-static" target="test"/>
      <ant dir="index-replace" target="test"/>
+     <ant dir="index-links" target="test"/>
      <ant dir="mimetype-filter" target="test"/>
      <ant dir="language-identifier" target="test"/>
      <ant dir="lib-http" target="test"/>
@@ -143,6 +145,7 @@
     <ant dir="index-static" target="clean"/>
     <ant dir="index-replace" target="clean"/>
     <ant dir="index-metadata" target="clean"/>
+    <ant dir="index-links" target="clean"/>
     <ant dir="mimetype-filter" target="clean"/>
     <ant dir="indexer-cloudsearch" target="clean"/>
     <ant dir="indexer-dummy" target="clean"/>

Added: nutch/trunk/src/plugin/index-links/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/build.xml?rev=1710033&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-links/build.xml (added)
+++ nutch/trunk/src/plugin/index-links/build.xml Thu Oct 22 15:26:34 2015
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-links" default="jar-core">
+
+    <import file="../build-plugin.xml"/>
+
+</project>

Added: nutch/trunk/src/plugin/index-links/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/ivy.xml?rev=1710033&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-links/ivy.xml (added)
+++ nutch/trunk/src/plugin/index-links/ivy.xml Thu Oct 22 15:26:34 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/index-links/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/plugin.xml?rev=1710033&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-links/plugin.xml (added)
+++ nutch/trunk/src/plugin/index-links/plugin.xml Thu Oct 22 15:26:34 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+    id="index-links"
+    name="Index inlinks and outlinks"
+    version="1.0.0"
+    provider-name="nutch.org">
+
+    <runtime>
+        <library name="index-links.jar">
+            <export name="*"/>
+        </library>
+    </runtime>
+
+    <requires>
+        <import plugin="nutch-extensionpoints"/>
+    </requires>
+
+    <extension id="org.apache.nutch.indexer.links.LinksIndexingFilter"
+               name="Links indexing filter"
+               point="org.apache.nutch.indexer.IndexingFilter">
+        <implementation id="org.apache.nutch.indexer.links.LinksIndexingFilter"
+                        
class="org.apache.nutch.indexer.links.LinksIndexingFilter"/>
+    </extension>
+
+</plugin>

Added: 
nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java?rev=1710033&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
 (added)
+++ 
nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
 Thu Oct 22 15:26:34 2015
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.links;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.slf4j.LoggerFactory;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that adds
+ * <code>outlinks</code> and <code>inlinks</code> field(s) to the document.
+ *
+ * If you want to ignore the outlinks that point to the same host
+ * as the URL being indexed, use the following setting in your configuration
+ * file:
+ *
+ * <property>
+ *   <name>index.links.outlinks.host.ignore</name>
+ *   <value>true</value>
+ * </property>
+ *
+ * The same configuration is available for inlinks:
+ *
+ * <property>
+ *   <name>index.links.inlinks.host.ignore</name>
+ *   <value>true</value>
+ * </property>
+ *
+ * To store only the host portion of each inlink URL or outlink URL, add the
+ * following to your configuration file (the default is false):
+ *
+ * <property>
+ *   <name>index.links.hosts.only</name>
+ *   <value>true</value>
+ * </property>
+ *
+ */
+public class LinksIndexingFilter implements IndexingFilter {
+
+  public final static String LINKS_OUTLINKS_HOST = 
"index.links.outlinks.host.ignore";
+  public final static String LINKS_INLINKS_HOST = 
"index.links.inlinks.host.ignore";
+  public final static String LINKS_ONLY_HOSTS = "index.links.hosts.only";
+
+  public final static org.slf4j.Logger LOG = LoggerFactory
+      .getLogger(LinksIndexingFilter.class);
+
+  private Configuration conf;
+  private boolean filterOutlinks;
+  private boolean filterInlinks;
+  private boolean indexHost;
+
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    // Add the outlinks
+    Outlink[] outlinks = parse.getData().getOutlinks();
+
+    if (outlinks != null) {
+      Set<String> hosts = new HashSet<String>();
+
+      for (Outlink outlink : outlinks) {
+        try {
+          String linkUrl = outlink.getToUrl();
+          String outHost = new URL(linkUrl).getHost().toLowerCase();
+
+          if (indexHost) {
+            linkUrl = outHost;
+
+            if (hosts.contains(linkUrl))
+              continue;
+
+            hosts.add(linkUrl);
+          }
+
+          addFilteredLink("outlinks", url.toString(), linkUrl, outHost,
+              filterOutlinks, doc);
+        } catch (MalformedURLException e) {
+          LOG.error("Malformed URL in {}: {}", url, e.getMessage());
+        }
+      }
+    }
+
+    // Add the inlinks
+    if (null != inlinks) {
+      Iterator<Inlink> iterator = inlinks.iterator();
+      Set<String> inlinkHosts = new HashSet<String>();
+
+      while (iterator.hasNext()) {
+        try {
+          Inlink link = iterator.next();
+          String linkUrl = link.getFromUrl();
+          String inHost = new URL(linkUrl).getHost().toLowerCase();
+
+          if (indexHost) {
+            linkUrl = inHost;
+
+            if (inlinkHosts.contains(linkUrl))
+              continue;
+
+            inlinkHosts.add(linkUrl);
+          }
+
+          addFilteredLink("inlinks", url.toString(), linkUrl, inHost,
+              filterInlinks, doc);
+        } catch (MalformedURLException e) {
+          LOG.error("Malformed URL in {}: {}", url, e.getMessage());
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  private void addFilteredLink(String fieldName, String url, String linkUrl,
+      String urlHost, boolean filter, NutchDocument doc) throws 
MalformedURLException {
+      if (filter) {
+        String host = new URL(url.toString()).getHost().toLowerCase();
+
+        if (!host.equalsIgnoreCase(urlHost)) {
+          doc.add(fieldName, linkUrl);
+        }
+      } else {
+        doc.add(fieldName, linkUrl);
+      }
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false);
+    filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false);
+
+    indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false);
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

Added: 
nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java?rev=1710033&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
 (added)
+++ 
nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
 Thu Oct 22 15:26:34 2015
@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.links;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.net.URL;
+import java.util.Iterator;
+
+public class TestLinksIndexingFilter {
+
+  Configuration conf = NutchConfiguration.create();
+  LinksIndexingFilter filter = new LinksIndexingFilter();
+  Metadata metadata = new Metadata();
+
+  @Before
+  public void setUp() throws Exception {
+    metadata.add(Response.CONTENT_TYPE, "text/html");
+  }
+
+  private Outlink[] generateOutlinks() throws Exception {
+    return generateOutlinks(false);
+  }
+
+  private Outlink[] generateOutlinks(boolean parts) throws Exception {
+    Outlink[] outlinks = new Outlink[2];
+
+    outlinks[0] = new Outlink("http://www.test.com";, "test");
+    outlinks[1] = new Outlink("http://www.example.com";, "example");
+
+    if (parts) {
+      outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1",
+          "test");
+      outlinks[1] = new Outlink(outlinks[1].getToUrl() + "/index.php?param=2",
+          "test");
+    }
+
+    return outlinks;
+  }
+
+  @Test
+  public void testFilterOutlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks();
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new 
ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/";), new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+    Assert.assertEquals("Filter outlinks, allow only those from a different 
host",
+        outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
+  }
+
+  @Test
+  public void testFilterInlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+    filter.setConf(conf);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com";, "test"));
+    inlinks.add(new Inlink("http://www.example.com";, "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new 
ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", new Outlink[0], 
metadata)),
+        new Text("http://www.example.com/";), new CrawlDatum(), inlinks);
+
+    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals("Filter inlinks, allow only those from a different 
host",
+        "http://www.test.com";, doc.getFieldValue("inlinks"));
+  }
+
+  @Test
+  public void testNoFilterOutlinks() throws Exception {
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks();
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new 
ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/";), new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals("All outlinks must be indexed even those from the same 
host",
+        outlinks.length, doc.getField("outlinks").getValues().size());
+  }
+
+  @Test
+  public void testNoFilterInlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
+    filter.setConf(conf);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com";, "test"));
+    inlinks.add(new Inlink("http://www.example.com";, "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new 
ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", new Outlink[0], 
metadata)),
+        new Text("http://www.example.com/";), new CrawlDatum(), inlinks);
+
+    Assert.assertEquals("All inlinks must be indexed even those from the same 
host",
+        inlinks.size(), doc.getField("inlinks").getValues().size());
+  }
+
+  @Test
+  public void testIndexOnlyHostPart() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks(true);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com/one-awesome-page";, "test"));
+    inlinks.add(new Inlink("http://www.test.com/other-awesome-page";, "test"));
+    inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example";,
+        "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new 
ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/";), new CrawlDatum(), inlinks);
+
+    NutchField docOutlinks = doc.getField("outlinks");
+
+    Assert.assertEquals("Only the host portion of the outlink URL must be 
indexed",
+        new URL("http://www.test.com";).getHost(),
+        docOutlinks.getValues().get(0));
+
+    Assert.assertEquals(
+        "The inlinks coming from the same host must count only once", 1,
+        doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals("Only the host portion of the inlinks URL must be 
indexed",
+        new URL("http://www.test.com";).getHost(), 
doc.getFieldValue("inlinks"));
+  }
+
+  @Test
+  public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+
+    Outlink[] outlinks = generateOutlinks(true);
+
+    filter.setConf(conf);
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new 
ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/";), new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+    Assert.assertEquals(
+        "Index only the host portion of the outlinks after filtering",
+        new URL("http://www.test.com";).getHost(),
+        doc.getFieldValue("outlinks"));
+  }
+
+  @Test
+  public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+
+    filter.setConf(conf);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com";, "test"));
+    inlinks.add(new Inlink("http://www.example.com";, "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new 
ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", new Outlink[0], 
metadata)),
+        new Text("http://www.example.com/";), new CrawlDatum(), inlinks);
+
+    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals(
+        "Index only the host portion of the inlinks after filtering",
+        new URL("http://www.test.com";).getHost(),
+        doc.getFieldValue("inlinks"));
+
+  }
+}
\ No newline at end of file


Reply via email to