Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1675243&r1=1675242&r2=1675243&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Wed Apr 22 01:46:28 2015 @@ -29,6 +29,7 @@ import org.apache.nutch.protocol.Protoco import org.apache.nutch.protocol.RobotRulesParser; import crawlercommons.robots.BaseRobotRules; +import crawlercommons.robots.SimpleRobotRules; /** * This class is used for parsing robots for urls belonging to HTTP protocol. It @@ -86,31 +87,16 @@ public class HttpRobotRulesParser extend */ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { - if (LOG.isTraceEnabled() && isWhiteListed(url)) { - LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url); - } - String cacheKey = getCacheKey(url); - BaseRobotRules robotRules = CACHE.get(cacheKey); - - if (robotRules != null) { - return robotRules; // cached rule - } else if (LOG.isTraceEnabled()) { - LOG.trace("cache miss " + url); - } + BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey); boolean cacheRule = true; - URL redir = null; - - if (isWhiteListed(url)) { - // check in advance whether a host is whitelisted - // (we do not need to fetch robots.txt) - robotRules = EMPTY_RULES; - LOG.info("Whitelisted host found for: {}", url); - LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", - url.getHost()); - } else { + if (robotRules == null) { // cache miss + URL redir = null; + if (LOG.isTraceEnabled()) { + LOG.trace("cache miss " + url); + } try { Response response = ((HttpBase) http).getResponse(new URL(url, "/robots.txt"), new CrawlDatum(), true); @@ -141,7 +127,7 @@ public class HttpRobotRulesParser extend else if ((response.getCode() == 403) && (!allowForbidden)) robotRules = FORBID_ALL_RULES; // use forbid all else if (response.getCode() >= 500) { - cacheRule = false; // try again later to fetch robots.txt + cacheRule = false; robotRules = EMPTY_RULES; } else robotRules = EMPTY_RULES; // use default rules @@ -149,19 +135,18 @@ public class HttpRobotRulesParser extend if (LOG.isInfoEnabled()) { LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); } - cacheRule = false; // try again later to fetch robots.txt + cacheRule = false; robotRules = EMPTY_RULES; } - } - if (cacheRule) { - CACHE.put(cacheKey, robotRules); // cache rules for host - if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) { - // cache also for the redirected host - CACHE.put(getCacheKey(redir), robotRules); + if (cacheRule) { + CACHE.put(cacheKey, robotRules); // cache rules for host + if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) { + // cache also for the redirected host + CACHE.put(getCacheKey(redir), robotRules); + } } } - return robotRules; } }
Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java?rev=1675243&r1=1675242&r2=1675243&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java (original) +++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java Wed Apr 22 01:46:28 2015 @@ -69,28 +69,15 @@ public class FtpRobotRulesParser extends // case String host = url.getHost().toLowerCase(); // normalize to lower case - if (LOG.isTraceEnabled() && isWhiteListed(url)) { - LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url); - } - - BaseRobotRules robotRules = CACHE.get(protocol + ":" + host); - - if (robotRules != null) { - return robotRules; // cached rule - } else if (LOG.isTraceEnabled()) { - LOG.trace("cache miss " + url); - } + BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + + host); boolean cacheRule = true; - if (isWhiteListed(url)) { - // check in advance whether a host is whitelisted - // (we do not need to fetch robots.txt) - robotRules = EMPTY_RULES; - LOG.info("Whitelisted host found for: {}", url); - LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host); + if (robotRules == null) { // cache miss + if (LOG.isTraceEnabled()) + LOG.trace("cache miss " + url); - } else { try { Text robotsUrl = new Text(new URL(url, "/robots.txt").toString()); ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, @@ -107,15 +94,13 @@ public class FtpRobotRulesParser extends if (LOG.isInfoEnabled()) { LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); } - cacheRule = false; // try again later to fetch robots.txt + cacheRule = false; robotRules = EMPTY_RULES; } + if (cacheRule) + CACHE.put(protocol + ":" + host, robotRules); // cache rules for host } - - if (cacheRule) - CACHE.put(protocol + ":" + host, robotRules); // cache rules for host - return robotRules; } }