Author: olegk
Date: Sun Aug 8 12:05:56 2010
New Revision: 983394
URL: http://svn.apache.org/viewvc?rev=983394&view=rev
Log:
Fixed NPE in CrawlingWorker; improved exception handling and logging in the
HTTP transport
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java?rev=983394&r1=983393&r2=983394&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/DroidsRequestRetryHandler.java
Sun Aug 8 12:05:56 2010
@@ -18,13 +18,13 @@ package org.apache.droids.protocol.http;
import java.io.IOException;
import java.io.InterruptedIOException;
-import java.net.ConnectException;
-import java.net.UnknownHostException;
+ import java.net.UnknownHostException;
import javax.net.ssl.SSLHandshakeException;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.HttpRequestRetryHandler;
+import org.apache.http.conn.HttpHostConnectException;
import org.apache.http.protocol.HttpContext;
class DroidsRequestRetryHandler implements HttpRequestRetryHandler
@@ -68,7 +68,7 @@ class DroidsRequestRetryHandler implemen
// Unknown host
return false;
}
- if (exception instanceof ConnectException) {
+ if (exception instanceof HttpHostConnectException) {
// Connection refused
return false;
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java?rev=983394&r1=983393&r2=983394&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/HttpProtocol.java
Sun Aug 8 12:05:56 2010
@@ -76,7 +76,7 @@ public class HttpProtocol extends Loggab
return new HttpContentEntity(entity, maxlen);
}
- public boolean isAllowed(URI uri) {
+ public boolean isAllowed(URI uri) throws IOException {
if (forceAllow) {
return forceAllow;
}
@@ -105,14 +105,10 @@ public class HttpProtocol extends Loggab
} catch (NoRobotException ex) {
log.error("Failure parsing robots.txt: " + ex.getMessage());
return false;
- } catch (IOException ex) {
- log.error("I/O error parsing robots.txt: " + ex.getMessage());
- return false;
}
boolean test = nrc.isUrlAllowed(uri);
- String message = (test) ? "allowed" : "denied";
if (log.isInfoEnabled()) {
- log.info("Url is " + message);
+ log.info(uri + " is " + (test ? "allowed" : "denied"));
}
return test;
}
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java?rev=983394&r1=983393&r2=983394&view=diff
==============================================================================
---
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
(original)
+++
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/robot/crawler/CrawlingWorker.java
Sun Aug 8 12:05:56 2010
@@ -52,6 +52,13 @@ public class CrawlingWorker extends Logg
}
URI uri = link.getURI();
final Protocol protocol = droid.getProtocolFactory().getProtocol(uri);
+ if (protocol == null) {
+ if (log.isWarnEnabled()) {
+ log.warn("Unsupported protocol scheme '" + uri.getScheme() + "'");
+ }
+ return;
+ }
+
if (protocol.isAllowed(uri)) {
if (log.isInfoEnabled()) {
log.info("Loading " + uri);
@@ -87,8 +94,10 @@ public class CrawlingWorker extends Logg
}
}
else {
- log.info("Stopping processing since"
- + " bots are not allowed for this url.");
+ if (log.isInfoEnabled()) {
+ log.info("Stopping processing since"
+ + " bots are not allowed for " + uri );
+ }
}
}
Modified:
incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt?rev=983394&r1=983393&r2=983394&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt
(original)
+++ incubator/droids/trunk/droids-core/src/main/resources/regex-urlfilter.txt
Sun Aug 8 12:05:56 2010
@@ -23,7 +23,7 @@
# matches, the URL is ignored.
# skip file: ftp: and mailto: urls
--^(ftp|mailto):
+-^(ftp|mailto|irc):
# skip URLs containing certain characters as probable queries, etc.
-...@#]