[2/5] nutch git commit: Ignore robots.txt when parsing segment, refactored storing of robots.txt in FetcherThread

snagel Mon, 22 Aug 2016 14:51:42 -0700

Ignore robots.txt when parsing segment, refactored storing of robots.txt in FetcherThread



Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/264eea01
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/264eea01
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/264eea01

Branch: refs/heads/master
Commit: 264eea01a4d868578dcf641d6ce405444d276929
Parents: 6c9cca5
Author: Sebastian Nagel <sna...@apache.org>
Authored: Fri Aug 19 15:06:14 2016 +0200
Committer: Sebastian Nagel <sna...@apache.org>
Committed: Fri Aug 19 15:06:14 2016 +0200

----------------------------------------------------------------------
 .../org/apache/nutch/fetcher/FetcherThread.java | 20 ++++++++++++++------
 .../org/apache/nutch/parse/ParseSegment.java    | 11 +++++++----
 2 files changed, 21 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/264eea01/src/java/org/apache/nutch/fetcher/FetcherThread.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index cac16ff..6024b8d 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -264,12 +264,7 @@ public class FetcherThread extends Thread {
                 .toString());
             BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, robotsTxtContent);
             if (robotsTxtContent != null) {
-              for (Content robotsTxt : robotsTxtContent) {
-                LOG.debug("fetched and stored robots.txt {}",
-                    robotsTxt.getUrl());
-                output.collect(new Text(robotsTxt.getUrl()),
-                    new NutchWritable(robotsTxt));
-              }
+              outputRobotsTxt(robotsTxtContent);
               robotsTxtContent.clear();
             }
             if (!rules.isAllowed(fit.u.toString())) {
@@ -758,6 +753,19 @@ public class FetcherThread extends Thread {
     return null;
   }
   
+  private void outputRobotsTxt(List<Content> robotsTxtContent) {
+    for (Content robotsTxt : robotsTxtContent) {
+      LOG.debug("fetched and stored robots.txt {}",
+          robotsTxt.getUrl());
+      try {
+        output.collect(new Text(robotsTxt.getUrl()),
+            new NutchWritable(robotsTxt));
+      } catch (IOException e) {
+        LOG.error("fetcher caught: {}", e.toString());
+      }
+    }
+  }
+
   private void updateStatus(int bytesInPage) throws IOException {
     pages.incrementAndGet();
     bytes.addAndGet(bytesInPage);

http://git-wip-us.apache.org/repos/asf/nutch/blob/264eea01/src/java/org/apache/nutch/parse/ParseSegment.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java
index b008bed..1a0da90 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -84,11 +84,14 @@ public class ParseSegment extends NutchTool implements Tool,
       key = newKey;
     }
 
-    int status = Integer.parseInt(content.getMetadata().get(
-        Nutch.FETCH_STATUS_KEY));
-    if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
+    String fetchStatus = content.getMetadata().get(Nutch.FETCH_STATUS_KEY);
+    if (fetchStatus == null) {
+      // no fetch status, skip document
+      LOG.debug("Skipping {} as content has no fetch status", key);
+      return;
+    } else if (Integer.parseInt(fetchStatus) != CrawlDatum.STATUS_FETCH_SUCCESS) {
       // content not fetched successfully, skip document
-      LOG.debug("Skipping " + key + " as content is not fetched successfully");
+      LOG.debug("Skipping {} as content is not fetched successfully", key);
       return;
     }

[2/5] nutch git commit: Ignore robots.txt when parsing segment, refactored storing of robots.txt in FetcherThread

Reply via email to