Author: jorgelbg Date: Thu Oct 22 15:26:34 2015 New Revision: 1710033 URL: http://svn.apache.org/viewvc?rev=1710033&view=rev Log: NUTCH-2139 Basic plugin to index inlinks and outlinks
Added: nutch/trunk/src/plugin/index-links/ nutch/trunk/src/plugin/index-links/build.xml nutch/trunk/src/plugin/index-links/ivy.xml nutch/trunk/src/plugin/index-links/plugin.xml nutch/trunk/src/plugin/index-links/src/ nutch/trunk/src/plugin/index-links/src/java/ nutch/trunk/src/plugin/index-links/src/java/org/ nutch/trunk/src/plugin/index-links/src/java/org/apache/ nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/ nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/ nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/ nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java nutch/trunk/src/plugin/index-links/src/test/ nutch/trunk/src/plugin/index-links/src/test/org/ nutch/trunk/src/plugin/index-links/src/test/org/apache/ nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/ nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/ nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/ nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/build.xml nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1710033&r1=1710032&r2=1710033&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Oct 22 15:26:34 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2139 Basic plugin to index inlinks and outlinks (jorgelbg) + * NUTCH-2128 Review and update mapred --> mapreduce config params in crawl script (lewismc) * NUTCH-2141 Change the InteractiveSelenium plugin handler Interface to return page content Modified: nutch/trunk/build.xml URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1710033&r1=1710032&r2=1710033&view=diff ============================================================================== --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Thu Oct 22 15:26:34 2015 @@ -178,6 +178,7 @@ <packageset dir="${plugins.dir}/index-geoip/src/java"/> <packageset dir="${plugins.dir}/index-replace/src/java"/> <packageset dir="${plugins.dir}/index-static/src/java"/> + <packageset dir="${plugins.dir}/index-links/src/java"/> <packageset dir="${plugins.dir}/mimetype-filter/src/java"/> <packageset dir="${plugins.dir}/indexer-dummy/src/java"/> <packageset dir="${plugins.dir}/indexer-elastic/src/java/" /> @@ -589,6 +590,7 @@ <packageset dir="${plugins.dir}/index-more/src/java"/> <packageset dir="${plugins.dir}/index-replace/src/java"/> <packageset dir="${plugins.dir}/index-static/src/java"/> + <packageset dir="${plugins.dir}/index-links/src/java"/> <packageset dir="${plugins.dir}/mimetype-filter/src/java"/> <packageset dir="${plugins.dir}/indexer-dummy/src/java"/> <packageset dir="${plugins.dir}/indexer-elastic/src/java/" /> @@ -979,6 +981,8 @@ <source path="${plugins.dir}/index-basic/src/java/" /> <source path="${plugins.dir}/index-basic/src/test/" /> <source path="${plugins.dir}/index-geoip/src/java/" /> + <source path="${plugins.dir}/index-links/src/java/" /> + <source path="${plugins.dir}/index-links/src/test/" /> <source path="${plugins.dir}/mimetype-filter/src/java/" /> <source path="${plugins.dir}/mimetype-filter/src/test/" /> <source path="${plugins.dir}/indexer-dummy/src/java/" /> Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1710033&r1=1710032&r2=1710033&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Thu Oct 22 15:26:34 2015 @@ -1896,4 +1896,37 @@ CAUTION: Set the parser.timeout 
to -1 or </description> </property> +<!-- index-links plugin --> + +<property> + <name>index.links.outlinks.host.ignore</name> + <value>false</value> + <description> + Ignore outlinks that point to the same host as the URL being indexed. + By default all outlinks are indexed. If db.ignore.internal.links is true (default + value), this setting does nothing since the internal links are already + ignored. + </description> +</property> + +<property> + <name>index.links.inlinks.host.ignore</name> + <value>false</value> + <description> + Ignore inlinks coming from the same host as the URL being indexed. By default + all inlinks are indexed. If db.ignore.internal.links is true (default + value), this setting does nothing since the internal links are already + ignored. + </description> +</property> + +<property> + <name>index.links.hosts.only</name> + <value>false</value> + <description> + This forces the index-links plugin to index only the host portion of the inlinks + or outlinks. + </description> +</property> + </configuration> Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1710033&r1=1710032&r2=1710033&view=diff ============================================================================== --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Thu Oct 22 15:26:34 2015 @@ -36,6 +36,7 @@ <ant dir="index-replace" target="deploy"/> <ant dir="index-static" target="deploy"/> <ant dir="index-metadata" target="deploy"/> + <ant dir="index-links" target="deploy"/> <ant dir="mimetype-filter" target="deploy"/> <ant dir="indexer-cloudsearch" target="deploy"/> <ant dir="indexer-dummy" target="deploy"/> @@ -98,6 +99,7 @@ <ant dir="index-more" target="test"/> <ant dir="index-static" target="test"/> <ant dir="index-replace" target="test"/> + <ant dir="index-links" target="test"/> <ant dir="mimetype-filter" target="test"/> <ant dir="language-identifier" target="test"/> <ant 
dir="lib-http" target="test"/> @@ -143,6 +145,7 @@ <ant dir="index-static" target="clean"/> <ant dir="index-replace" target="clean"/> <ant dir="index-metadata" target="clean"/> + <ant dir="index-links" target="clean"/> <ant dir="mimetype-filter" target="clean"/> <ant dir="indexer-cloudsearch" target="clean"/> <ant dir="indexer-dummy" target="clean"/> Added: nutch/trunk/src/plugin/index-links/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/build.xml?rev=1710033&view=auto ============================================================================== --- nutch/trunk/src/plugin/index-links/build.xml (added) +++ nutch/trunk/src/plugin/index-links/build.xml Thu Oct 22 15:26:34 2015 @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="index-links" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> Added: nutch/trunk/src/plugin/index-links/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/ivy.xml?rev=1710033&view=auto ============================================================================== --- nutch/trunk/src/plugin/index-links/ivy.xml (added) +++ nutch/trunk/src/plugin/index-links/ivy.xml Thu Oct 22 15:26:34 2015 @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> Added: nutch/trunk/src/plugin/index-links/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/plugin.xml?rev=1710033&view=auto ============================================================================== --- nutch/trunk/src/plugin/index-links/plugin.xml (added) +++ nutch/trunk/src/plugin/index-links/plugin.xml Thu Oct 22 15:26:34 2015 @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="index-links" + name="Index inlinks and outlinks" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="index-links.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.indexer.links.LinksIndexingFilter" + name="Links indexing filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="org.apache.nutch.indexer.links.LinksIndexingFilter" + class="org.apache.nutch.indexer.links.LinksIndexingFilter"/> + </extension> + +</plugin> Added: nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java?rev=1710033&view=auto ============================================================================== --- nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java (added) +++ nutch/trunk/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java Thu Oct 22 15:26:34 2015 @@ -0,0 +1,167 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.links; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.slf4j.LoggerFactory; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +/** + * An {@link org.apache.nutch.indexer.IndexingFilter} that adds + * <code>outlinks</code> and <code>inlinks</code> field(s) to the document. + * + * In case that you want to ignore the outlinks that point to the same host + * as the URL being indexed use the following settings in your configuration + * file: + * + * <property> + * <name>index.links.outlinks.host.ignore</name> + * <value>true</value> + * </property> + * + * The same configuration is available for inlinks: + * + * <property> + * <name>index.links.inlinks.host.ignore</name> + * <value>true</value> + * </property> + * + * To store only the host portion of each inlink URL or outlink URL add the + * following to your configuration file. 
+ * + * <property> + * <name>index.links.hosts.only</name> + * <value>true</value> + * </property> + * + */ +public class LinksIndexingFilter implements IndexingFilter { + + public final static String LINKS_OUTLINKS_HOST = "index.links.outlinks.host.ignore"; + public final static String LINKS_INLINKS_HOST = "index.links.inlinks.host.ignore"; + public final static String LINKS_ONLY_HOSTS = "index.links.hosts.only"; + + public final static org.slf4j.Logger LOG = LoggerFactory + .getLogger(LinksIndexingFilter.class); + + private Configuration conf; + private boolean filterOutlinks; + private boolean filterInlinks; + private boolean indexHost; + + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + // Add the outlinks + Outlink[] outlinks = parse.getData().getOutlinks(); + + if (outlinks != null) { + Set<String> hosts = new HashSet<String>(); + + for (Outlink outlink : outlinks) { + try { + String linkUrl = outlink.getToUrl(); + String outHost = new URL(linkUrl).getHost().toLowerCase(); + + if (indexHost) { + linkUrl = outHost; + + if (hosts.contains(linkUrl)) + continue; + + hosts.add(linkUrl); + } + + addFilteredLink("outlinks", url.toString(), linkUrl, outHost, + filterOutlinks, doc); + } catch (MalformedURLException e) { + LOG.error("Malformed URL in {}: {}", url, e.getMessage()); + } + } + } + + // Add the inlinks + if (null != inlinks) { + Iterator<Inlink> iterator = inlinks.iterator(); + Set<String> inlinkHosts = new HashSet<String>(); + + while (iterator.hasNext()) { + try { + Inlink link = iterator.next(); + String linkUrl = link.getFromUrl(); + String inHost = new URL(linkUrl).getHost().toLowerCase(); + + if (indexHost) { + linkUrl = inHost; + + if (inlinkHosts.contains(linkUrl)) + continue; + + inlinkHosts.add(linkUrl); + } + + addFilteredLink("inlinks", url.toString(), linkUrl, inHost, + filterInlinks, doc); + } catch (MalformedURLException e) { + 
LOG.error("Malformed URL in {}: {}", url, e.getMessage()); + } + } + } + + return doc; + } + + private void addFilteredLink(String fieldName, String url, String linkUrl, + String urlHost, boolean filter, NutchDocument doc) throws MalformedURLException { + if (filter) { + String host = new URL(url.toString()).getHost().toLowerCase(); + + if (!host.equalsIgnoreCase(urlHost)) { + doc.add(fieldName, linkUrl); + } + } else { + doc.add(fieldName, linkUrl); + } + } + + public void setConf(Configuration conf) { + this.conf = conf; + filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false); + filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false); + + indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false); + } + + public Configuration getConf() { + return this.conf; + } +} Added: nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java?rev=1710033&view=auto ============================================================================== --- nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java (added) +++ nutch/trunk/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java Thu Oct 22 15:26:34 2015 @@ -0,0 +1,218 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.links; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.indexer.NutchField; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.util.NutchConfiguration; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.net.URL; +import java.util.Iterator; + +public class TestLinksIndexingFilter { + + Configuration conf = NutchConfiguration.create(); + LinksIndexingFilter filter = new LinksIndexingFilter(); + Metadata metadata = new Metadata(); + + @Before + public void setUp() throws Exception { + metadata.add(Response.CONTENT_TYPE, "text/html"); + } + + private Outlink[] generateOutlinks() throws Exception { + return generateOutlinks(false); + } + + private Outlink[] generateOutlinks(boolean parts) throws Exception { + Outlink[] outlinks = new Outlink[2]; + + outlinks[0] = new Outlink("http://www.test.com", "test"); + outlinks[1] = new Outlink("http://www.example.com", "example"); + + if (parts) { + outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1", + "test"); + outlinks[1] = new Outlink(outlinks[1].getToUrl() 
+ "/index.php?param=2", + "test"); + } + + return outlinks; + } + + @Test + public void testFilterOutlinks() throws Exception { + conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); + filter.setConf(conf); + + Outlink[] outlinks = generateOutlinks(); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + Assert.assertEquals(1, doc.getField("outlinks").getValues().size()); + + Assert.assertEquals("Filter outlinks, allow only those from a different host", + outlinks[0].getToUrl(), doc.getFieldValue("outlinks")); + } + + @Test + public void testFilterInlinks() throws Exception { + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); + filter.setConf(conf); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com", "test")); + inlinks.add(new Inlink("http://www.example.com", "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + Assert.assertEquals(1, doc.getField("inlinks").getValues().size()); + + Assert.assertEquals("Filter inlinks, allow only those from a different host", + "http://www.test.com", doc.getFieldValue("inlinks")); + } + + @Test + public void testNoFilterOutlinks() throws Exception { + filter.setConf(conf); + + Outlink[] outlinks = generateOutlinks(); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + Assert.assertEquals("All outlinks must be indexed even those from the same host", + outlinks.length, doc.getField("outlinks").getValues().size()); + } + + @Test + public void testNoFilterInlinks() throws 
Exception { + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false"); + filter.setConf(conf); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com", "test")); + inlinks.add(new Inlink("http://www.example.com", "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + Assert.assertEquals("All inlinks must be indexed even those from the same host", + inlinks.size(), doc.getField("inlinks").getValues().size()); + } + + @Test + public void testIndexOnlyHostPart() throws Exception { + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); + conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); + conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); + filter.setConf(conf); + + Outlink[] outlinks = generateOutlinks(true); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test")); + inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test")); + inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example", + "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + NutchField docOutlinks = doc.getField("outlinks"); + + Assert.assertEquals("Only the host portion of the outlink URL must be indexed", + new URL("http://www.test.com").getHost(), + docOutlinks.getValues().get(0)); + + Assert.assertEquals( + "The inlinks coming from the same host must count only once", 1, + doc.getField("inlinks").getValues().size()); + + Assert.assertEquals("Only the host portion of the inlinks URL must be indexed", + new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks")); + } + + @Test + public void 
testIndexHostsOnlyAndFilterOutlinks() throws Exception { + conf = NutchConfiguration.create(); + conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); + conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true"); + + Outlink[] outlinks = generateOutlinks(true); + + filter.setConf(conf); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", outlinks, metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + Assert.assertEquals(1, doc.getField("outlinks").getValues().size()); + + Assert.assertEquals( + "Index only the host portion of the outlinks after filtering", + new URL("http://www.test.com").getHost(), + doc.getFieldValue("outlinks")); + } + + @Test + public void testIndexHostsOnlyAndFilterInlinks() throws Exception { + conf = NutchConfiguration.create(); + conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true"); + conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true"); + + filter.setConf(conf); + + Inlinks inlinks = new Inlinks(); + inlinks.add(new Inlink("http://www.test.com", "test")); + inlinks.add(new Inlink("http://www.example.com", "example")); + + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), + new Text("http://www.example.com/"), new CrawlDatum(), inlinks); + + Assert.assertEquals(1, doc.getField("inlinks").getValues().size()); + + Assert.assertEquals( + "Index only the host portion of the inlinks after filtering", + new URL("http://www.test.com").getHost(), + doc.getFieldValue("inlinks")); + + } +} \ No newline at end of file