[ https://issues.apache.org/jira/browse/NUTCH-2139?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14959349#comment-14959349 ]
ASF GitHub Bot commented on NUTCH-2139: --------------------------------------- Github user sebastian-nagel commented on a diff in the pull request: https://github.com/apache/nutch/pull/78#discussion_r42160948 --- Diff: src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java --- @@ -0,0 +1,168 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.links; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.slf4j.LoggerFactory; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +/** + * An {@link org.apache.nutch.indexer.IndexingFilter} that adds + * <code>outlinks</code> and <code>inlinks</code> field(s) to the document. 
+ * + * In case that you want to ignore the outlinks that point to the same host + * as the URL being indexed use the following settings in your configuration + * file: + * + * <property> + * <name>outlinks.host.ignore</name> + * <value>true</value> + * </property> + * + * The same configuration is available for inlinks: + * + * <property> + * <name>inlinks.host.ignore</name> + * <value>true</value> + * </property> + * + * To store only the host portion of each inlink URL or outlink URL add the + * following to your configuration file. + * + * <property> + * <name>links.hosts.only</name> + * <value>false</value> + * </property> + * + */ +public class LinksIndexingFilter implements IndexingFilter { + + public final static String LINKS_OUTLINKS_HOST = "outlinks.host.ignore"; + public final static String LINKS_INLINKS_HOST = "inlinks.host.ignore"; + public final static String LINKS_ONLY_HOSTS = "links.hosts.only"; + + public final static org.slf4j.Logger LOG = LoggerFactory + .getLogger(LinksIndexingFilter.class); + + private Configuration conf; + private boolean filterOutlinks; + private boolean filterInlinks; + private boolean indexHost; + + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + // Add the outlinks + Outlink[] outlinks = parse.getData().getOutlinks(); + + try { + if (outlinks != null) { + Set<String> hosts = new HashSet<String>(); + + for (Outlink outlink : outlinks) { + String linkUrl = outlink.getToUrl(); + String outHost = new URL(linkUrl).getHost(); + + if (indexHost) { + linkUrl = new URL(outlink.getToUrl()).getHost(); --- End diff -- now linkurl is equal to outHost - could be simplified > Basic plugin to index inlinks and outlinks > ------------------------------------------ > > Key: NUTCH-2139 > URL: https://issues.apache.org/jira/browse/NUTCH-2139 > Project: Nutch > Issue Type: Improvement > Components: indexer, plugin > Reporter: Jorge Luis 
Betancourt Gonzalez > Priority: Minor > Labels: link, plugin > Fix For: 1.11 > > > Basic plugin that allows indexing the inlinks and outlinks of web pages; > this could be very useful for analytic purposes, including neat > visualizations using d3.js, for instance. -- This message was sent by Atlassian JIRA (v6.3.4#6332)