Author: siren
Date: Tue Sep 19 09:01:34 2006
New Revision: 447893

URL: http://svn.apache.org/viewvc?view=rev&rev=447893
Log:
NUTCH-105 - Network error during robots.txt fetch causes file to be ignored, contributed by Greg Kim
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=447893&r1=447892&r2=447893
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Sep 19 09:01:34 2006
@@ -23,6 +23,9 @@
  8. NUTCH-338 - Remove the text parser as an option for parsing PDF files
     in parse-plugins.xml (Chris A. Mattmann via siren)
+
+ 9. NUTCH-105 - Network error during robots.txt fetch causes file to
+    be ignored (Greg Kim via siren)
 
 Release 0.8 - 2006-07-25

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=447893&r1=447892&r2=447893
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Tue Sep 19 09:01:34 2006
@@ -420,6 +420,8 @@
 
     RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
 
+    boolean cacheRule = true;
+
     if (robotRules == null) {                     // cache miss
       if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
       try {
@@ -430,16 +432,22 @@
           robotRules = parseRules(response.getContent());
         else if ( (response.getCode() == 403) && (!allowForbidden) )
           robotRules = FORBID_ALL_RULES;          // use forbid all
-        else
+        else if (response.getCode() >= 500) {
+          cacheRule = false;
+          robotRules = EMPTY_RULES;
+        } else
           robotRules = EMPTY_RULES;               // use default rules
       } catch (Throwable t) {
         if (LOG.isInfoEnabled()) {
           LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
         }
+        cacheRule = false;
         robotRules = EMPTY_RULES;
       }
 
-      CACHE.put(host, robotRules);                // cache rules for host
+      if (cacheRule) {
+        CACHE.put(host, robotRules);              // cache rules for host
+      }
     }
     return robotRules;
   }
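
The gist of the fix: before this change, a failed robots.txt fetch (5xx response or a network exception) was cached as EMPTY_RULES, so the host was crawled unrestricted until the cache entry expired. The new cacheRule flag lets the fetch still fall back to EMPTY_RULES for the current request while skipping the CACHE.put, so the next request retries robots.txt. Below is a minimal, self-contained sketch of that decision, assuming hypothetical names (RobotsCacheSketch, getRobotRules, string rule constants, a status code passed in directly) rather than the real Nutch API:

import java.util.HashMap;
import java.util.Map;

public class RobotsCacheSketch {

  static final String EMPTY_RULES = "EMPTY_RULES";
  static final String FORBID_ALL_RULES = "FORBID_ALL_RULES";
  static final Map<String, String> CACHE = new HashMap<>();
  static final boolean allowForbidden = false;

  static String getRobotRules(String host, int code, String content) {
    String robotRules = CACHE.get(host);
    boolean cacheRule = true;                 // assume the answer is definitive

    if (robotRules == null) {                 // cache miss
      if (code == 200)
        robotRules = "parsed:" + content;     // stands in for parseRules()
      else if (code == 403 && !allowForbidden)
        robotRules = FORBID_ALL_RULES;        // 403: forbid everything
      else if (code >= 500) {
        cacheRule = false;                    // server error: don't remember it
        robotRules = EMPTY_RULES;
      } else
        robotRules = EMPTY_RULES;             // e.g. 404: no restrictions

      // A fetch exception would take the same path: cacheRule = false,
      // robotRules = EMPTY_RULES (see the catch block in the diff).
      if (cacheRule) {
        CACHE.put(host, robotRules);          // only cache definitive answers
      }
    }
    return robotRules;
  }

  public static void main(String[] args) {
    // A 503 answers with EMPTY_RULES but leaves the cache empty...
    System.out.println(getRobotRules("example.org", 503, ""));
    System.out.println("cached after 503? " + CACHE.containsKey("example.org"));
    // ...so a later 200 can still install the real rules.
    System.out.println(getRobotRules("example.org", 200, "User-agent: *"));
    System.out.println("cached after 200? " + CACHE.containsKey("example.org"));
  }
}

The design choice is deliberate: a transient server error degrades to "no restrictions" for a single fetch, rather than being remembered as the host's permanent robots policy.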