Ignore robots.txt content when parsing a segment; refactor storing of robots.txt in FetcherThread
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/264eea01
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/264eea01
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/264eea01

Branch: refs/heads/master
Commit: 264eea01a4d868578dcf641d6ce405444d276929
Parents: 6c9cca5
Author: Sebastian Nagel <sna...@apache.org>
Authored: Fri Aug 19 15:06:14 2016 +0200
Committer: Sebastian Nagel <sna...@apache.org>
Committed: Fri Aug 19 15:06:14 2016 +0200

----------------------------------------------------------------------
 .../org/apache/nutch/fetcher/FetcherThread.java | 20 ++++++++++++++------
 .../org/apache/nutch/parse/ParseSegment.java    | 11 +++++++----
 2 files changed, 21 insertions(+), 10 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/nutch/blob/264eea01/src/java/org/apache/nutch/fetcher/FetcherThread.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index cac16ff..6024b8d 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -264,12 +264,7 @@ public class FetcherThread extends Thread {
                   .toString());
               BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, robotsTxtContent);
               if (robotsTxtContent != null) {
-                for (Content robotsTxt : robotsTxtContent) {
-                  LOG.debug("fetched and stored robots.txt {}",
-                      robotsTxt.getUrl());
-                  output.collect(new Text(robotsTxt.getUrl()),
-                      new NutchWritable(robotsTxt));
-                }
+                outputRobotsTxt(robotsTxtContent);
                 robotsTxtContent.clear();
               }
               if (!rules.isAllowed(fit.u.toString())) {
@@ -758,6 +753,19 @@ public class FetcherThread extends Thread {
     return null;
   }

+  private void outputRobotsTxt(List<Content> robotsTxtContent) {
+    for (Content robotsTxt : robotsTxtContent) {
+      LOG.debug("fetched and stored robots.txt {}",
+          robotsTxt.getUrl());
+      try {
+        output.collect(new Text(robotsTxt.getUrl()),
+            new NutchWritable(robotsTxt));
+      } catch (IOException e) {
+        LOG.error("fetcher caught: {}", e.toString());
+      }
+    }
+  }
+
   private void updateStatus(int bytesInPage) throws IOException {
     pages.incrementAndGet();
     bytes.addAndGet(bytesInPage);

http://git-wip-us.apache.org/repos/asf/nutch/blob/264eea01/src/java/org/apache/nutch/parse/ParseSegment.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java
index b008bed..1a0da90 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -84,11 +84,14 @@ public class ParseSegment extends NutchTool implements Tool,
       key = newKey;
     }

-    int status = Integer.parseInt(content.getMetadata().get(
-        Nutch.FETCH_STATUS_KEY));
-    if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
+    String fetchStatus = content.getMetadata().get(Nutch.FETCH_STATUS_KEY);
+    if (fetchStatus == null) {
+      // no fetch status, skip document
+      LOG.debug("Skipping {} as content has no fetch status", key);
+      return;
+    } else if (Integer.parseInt(fetchStatus) != CrawlDatum.STATUS_FETCH_SUCCESS) {
       // content not fetched successfully, skip document
-      LOG.debug("Skipping " + key + " as content is not fetched successfully");
+      LOG.debug("Skipping {} as content is not fetched successfully", key);
       return;
     }