Author: siren
Date: Tue Sep 19 07:52:37 2006
New Revision: 447867

URL: http://svn.apache.org/viewvc?view=rev&rev=447867
Log:
NUTCH-105 - Network error during robots.txt fetch causes file to be ignored, 
contributed by Greg Kim

Modified:
    lucene/nutch/branches/branch-0.8/CHANGES.txt
    lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java

Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?view=diff&rev=447867&r1=447866&r2=447867
==============================================================================
--- lucene/nutch/branches/branch-0.8/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.8/CHANGES.txt Tue Sep 19 07:52:37 2006
@@ -22,6 +22,9 @@
  7. NUTCH-338 - Remove the text parser as an option for parsing PDF files
     in parse-plugins.xml (Chris A. Mattmann via siren)
 
+ 8. NUTCH-105 - Network error during robots.txt fetch causes file to
+    be ignored (Greg Kim via siren)
+    
 Release 0.8 - 2006-07-25
 
  0. Totally new architecture, based on hadoop

Modified: lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=447867&r1=447866&r2=447867
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Tue Sep 19 07:52:37 2006
@@ -420,6 +420,8 @@
 
     RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
 
+    boolean cacheRule = true;
+    
     if (robotRules == null) {                     // cache miss
       if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
       try {
@@ -430,16 +432,22 @@
           robotRules = parseRules(response.getContent());
         else if ( (response.getCode() == 403) && (!allowForbidden) )
           robotRules = FORBID_ALL_RULES;            // use forbid all
-        else                                        
+        else if (response.getCode() >= 500) {
+          cacheRule = false;
+          robotRules = EMPTY_RULES;
+        } else
           robotRules = EMPTY_RULES;                 // use default rules
       } catch (Throwable t) {
         if (LOG.isInfoEnabled()) {
           LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
         }
+        cacheRule = false;
         robotRules = EMPTY_RULES;
       }
 
-      CACHE.put(host, robotRules);                // cache rules for host
+      if (cacheRule) {
+        CACHE.put(host, robotRules);          // cache rules for host
+      }
     }
     return robotRules;
   }
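
For reference, the pattern this patch introduces can be sketched in
isolation: parsed robots.txt rules are cached per host only when the fetch
produced a definitive answer, while 5xx responses and network errors fall
back to empty rules without caching, so the host is retried on the next
lookup. The class, the RuleSet enum and the fetch() stub below are
hypothetical illustrations, not the Nutch API.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Minimal sketch (assumed names, not Nutch classes) of
// "cache only definitive robots.txt answers".
public class RobotsCacheSketch {

  enum RuleSet { EMPTY_RULES, FORBID_ALL_RULES, PARSED_RULES }

  private final Map<String, RuleSet> cache =
      new ConcurrentHashMap<String, RuleSet>();

  /** Stand-in for fetching http://host/robots.txt; returns an HTTP status. */
  int fetch(String host) throws Exception {
    return 503;                             // simulate a transient server error
  }

  RuleSet getRules(String host) {
    RuleSet rules = cache.get(host);
    if (rules != null) {
      return rules;                         // cache hit
    }

    boolean cacheRule = true;               // assume the answer is cacheable
    try {
      int code = fetch(host);
      if (code == 200) {
        rules = RuleSet.PARSED_RULES;       // parse the fetched robots.txt
      } else if (code == 403) {
        rules = RuleSet.FORBID_ALL_RULES;   // host forbids crawling
      } else if (code >= 500) {
        cacheRule = false;                  // transient server error: don't cache
        rules = RuleSet.EMPTY_RULES;
      } else {
        rules = RuleSet.EMPTY_RULES;        // e.g. 404: no robots.txt, allow all
      }
    } catch (Exception e) {
      cacheRule = false;                    // network error: don't cache
      rules = RuleSet.EMPTY_RULES;
    }

    if (cacheRule) {
      cache.put(host, rules);               // only definitive answers are cached
    }
    return rules;
  }

  public static void main(String[] args) {
    RobotsCacheSketch sketch = new RobotsCacheSketch();
    System.out.println(sketch.getRules("example.com")); // EMPTY_RULES, not cached
  }
}

Caching EMPTY_RULES on a transient failure would let the crawler ignore that
host's robots.txt for the rest of the run; skipping the cache keeps the
failure from becoming sticky, at the cost of re-fetching robots.txt on the
next request to that host.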

