[ https://issues.apache.org/jira/browse/NUTCH-2139?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14967783#comment-14967783 ]
ASF GitHub Bot commented on NUTCH-2139: --------------------------------------- Github user sebastian-nagel commented on a diff in the pull request: https://github.com/apache/nutch/pull/78#discussion_r42675133 --- Diff: src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java --- @@ -0,0 +1,168 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.indexer.links; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.slf4j.LoggerFactory; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +/** + * An {@link org.apache.nutch.indexer.IndexingFilter} that adds + * <code>outlinks</code> and <code>inlinks</code> field(s) to the document. 
+ * + * In case that you want to ignore the outlinks that point to the same host + * as the URL being indexed use the following settings in your configuration + * file: + * + * <property> + * <name>outlinks.host.ignore</name> + * <value>true</value> + * </property> + * + * The same configuration is available for inlinks: + * + * <property> + * <name>inlinks.host.ignore</name> + * <value>true</value> + * </property> + * + * To store only the host portion of each inlink URL or outlink URL add the + * following to your configuration file. + * + * <property> + * <name>links.hosts.only</name> + * <value>false</value> + * </property> + * + */ +public class LinksIndexingFilter implements IndexingFilter { + + public final static String LINKS_OUTLINKS_HOST = "outlinks.host.ignore"; + public final static String LINKS_INLINKS_HOST = "inlinks.host.ignore"; + public final static String LINKS_ONLY_HOSTS = "links.hosts.only"; + + public final static org.slf4j.Logger LOG = LoggerFactory + .getLogger(LinksIndexingFilter.class); + + private Configuration conf; + private boolean filterOutlinks; + private boolean filterInlinks; + private boolean indexHost; + + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + // Add the outlinks + Outlink[] outlinks = parse.getData().getOutlinks(); + + try { + if (outlinks != null) { + Set<String> hosts = new HashSet<String>(); + + for (Outlink outlink : outlinks) { + String linkUrl = outlink.getToUrl(); + String outHost = new URL(linkUrl).getHost(); + + if (indexHost) { + linkUrl = new URL(outlink.getToUrl()).getHost(); + + if (hosts.contains(linkUrl)) + continue; + + hosts.add(linkUrl); + } + + addFilteredLink("outlinks", url.toString(), linkUrl, outHost, + filterOutlinks, doc); + } + } + } catch (MalformedURLException e) { + LOG.error("Malformed URL in {}: {}", url, e.getMessage()); + } + + // Add the inlinks + if (null != inlinks) { + 
+ Iterator<Inlink> iterator = inlinks.iterator(); + Set<String> inlinkHosts = new HashSet<String>(); + + try { + while (iterator.hasNext()) { --- End diff -- +1 great! On 10/15/2015 09:20 PM, Jorge Luis Betancourt wrote: > In src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java > <https://github.com/apache/nutch/pull/78#discussion_r42168120>: > >> + >> + addFilteredLink("outlinks", url.toString(), linkUrl, outHost, >> + filterOutlinks, doc); >> + } >> + } >> + } catch (MalformedURLException e) { >> + LOG.error("Malformed URL in {}: {}", url, e.getMessage()); >> + } >> + >> + // Add the inlinks >> + if (null != inlinks) { >> + Iterator<Inlink> iterator = inlinks.iterator(); >> + Set<String> inlinkHosts = new HashSet<String>(); >> + >> + try { >> + while (iterator.hasNext()) { > > I thought the same: since the URL is already fetched there shouldn't be any trouble, but it's an easy fix > so I can put it inside the while loop. > > — > Reply to this email directly or view it on GitHub > <https://github.com/apache/nutch/pull/78/files#r42168120>. > > Basic plugin to index inlinks and outlinks > ------------------------------------------ > > Key: NUTCH-2139 > URL: https://issues.apache.org/jira/browse/NUTCH-2139 > Project: Nutch > Issue Type: Improvement > Components: indexer, plugin > Reporter: Jorge Luis Betancourt Gonzalez > Priority: Minor > Labels: link, plugin > Fix For: 1.12 > > > Basic plugin that allows indexing the inlinks and outlinks of web pages; > this could be very useful for analytic purposes, including neat > visualizations using d3.js, for instance. -- This message was sent by Atlassian JIRA (v6.3.4#6332)