Author: lewismc
Date: Wed May 6 23:32:39 2015
New Revision: 1678111
URL: http://svn.apache.org/r1678111
Log: NUTCH-2004 ParseChecker does not handle redirects
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1678111&r1=1678110&r2=1678111&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed May 6 23:32:39 2015 @@ -2,7 +2,7 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT -* NUTCH-XX +* NUTCH-2004 ParseChecker does not handle redirects (mjoyce via lewismc) Nutch 1.10 Release - 29/04/2015 (dd/mm/yyyy) Release Report: http://s.apache.org/nutch10 Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1678111&r1=1678110&r2=1678111&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed May 6 23:32:39 2015 @@ -34,6 +34,7 @@ import org.apache.nutch.protocol.Content import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolFactory; import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.URLUtil; @@ -135,9 +136,30 @@ public class ParserChecker implements To Text turl = new Text(url); ProtocolOutput output = protocol.getProtocolOutput(turl, cd); + // If the configuration permits, handle redirects until we either run + // out of allowed redirects or we stop getting redirect statuses. 
+ int maxRedirects = conf.getInt("http.redirect.max", 0); + int numRedirects = 0; + while (output.getStatus().isRedirect() && numRedirects < maxRedirects) { + String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]); + LOG.info("Handling redirect to " + newURL); + + protocol = factory.getProtocol(newURL); + turl = new Text(newURL); + output = protocol.getProtocolOutput(turl, cd); + + numRedirects++; + } + if (!output.getStatus().isSuccess()) { System.err.println("Fetch failed with protocol status: " + output.getStatus()); + + if (output.getStatus().isRedirect()) { + System.err.println("Redirect(s) not handled due to configuration."); + System.err.println("Max Redirects to handle per config: " + maxRedirects); + System.err.println("Number of Redirects handled: " + numRedirects); + } return (-1); } Modified: nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=1678111&r1=1678110&r2=1678111&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original) +++ nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Wed May 6 23:32:39 2015 @@ -227,6 +227,10 @@ public class ProtocolStatus implements W || code == ROBOTS_DENIED; } + public boolean isRedirect() { + return code == MOVED || code == TEMP_MOVED; + } + public String getMessage() { if (args != null && args.length > 0) return args[0];