Author: lewismc
Date: Wed May  6 23:32:39 2015
New Revision: 1678111

URL: http://svn.apache.org/r1678111
Log:
NUTCH-2004 ParseChecker does not handle redirects

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
    nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1678111&r1=1678110&r2=1678111&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May  6 23:32:39 2015
@@ -2,7 +2,7 @@ Nutch Change Log
  
 Nutch Current Development 1.11-SNAPSHOT
 
-* NUTCH-XX
+* NUTCH-2004 ParseChecker does not handle redirects (mjoyce via lewismc)
 
 Nutch 1.10 Release - 29/04/2015 (dd/mm/yyyy)
 Release Report: http://s.apache.org/nutch10

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1678111&r1=1678110&r2=1678111&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed May  6 23:32:39 2015
@@ -34,6 +34,7 @@ import org.apache.nutch.protocol.Content
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.URLUtil;
@@ -135,9 +136,30 @@ public class ParserChecker implements To
     Text turl = new Text(url);
     ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
 
+    // If the configuration permits, handle redirects until we either run
+    // out of allowed redirects or we stop getting redirect statuses.
+    int maxRedirects = conf.getInt("http.redirect.max", 0);
+    int numRedirects = 0;
+    while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
+        String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
+        LOG.info("Handling redirect to " + newURL);
+
+        protocol = factory.getProtocol(newURL);
+        turl = new Text(newURL);
+        output = protocol.getProtocolOutput(turl, cd);
+
+        numRedirects++;
+    }
+
     if (!output.getStatus().isSuccess()) {
       System.err.println("Fetch failed with protocol status: "
           + output.getStatus());
+
+      if (output.getStatus().isRedirect()) {
+          System.err.println("Redirect(s) not handled due to configuration.");
+          System.err.println("Max Redirects to handle per config: " + maxRedirects);
+          System.err.println("Number of Redirects handled: " + numRedirects);
+      }
       return (-1);
     }
 

Modified: nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=1678111&r1=1678110&r2=1678111&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Wed May  6 23:32:39 2015
@@ -227,6 +227,10 @@ public class ProtocolStatus implements W
         || code == ROBOTS_DENIED;
   }
 
+  public boolean isRedirect() {
+      return code == MOVED || code == TEMP_MOVED;
+  }
+
   public String getMessage() {
     if (args != null && args.length > 0)
       return args[0];


Reply via email to