This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
     new e6bc451  NUTCH-2775 Fetcher to guarantee minimum delay even if robots.txt defines shorter Crawl-delay - guaranteed minimum delay is configured by `fetcher.min.crawl.delay` (default set equal to `fetcher.server.delay`)
     new e6d3e57  Merge pull request #506 from sebastian-nagel/NUTCH-2775-robots-min-delay
e6bc451 is described below

commit e6bc45181369ced98fa7d9df23685620938b0c9c
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Wed Mar 25 10:32:36 2020 +0100

    NUTCH-2775 Fetcher to guarantee minimum delay even if robots.txt defines shorter Crawl-delay
    - guaranteed minimum delay is configured by `fetcher.min.crawl.delay`
      (default set equal to `fetcher.server.delay`)
---
 conf/nutch-default.xml                                | 12 ++++++++++++
 src/java/org/apache/nutch/fetcher/FetcherThread.java  | 17 ++++++++++++++---
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 85d9933..6dfbe64 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -959,6 +959,18 @@
 </property>
 
 <property>
+  <name>fetcher.min.crawl.delay</name>
+  <value>${fetcher.server.delay}</value>
+  <description>
+  Minimum Crawl-Delay (in seconds) accepted in robots.txt, even if the
+  robots.txt specifies a shorter delay. By default the minimum Crawl-Delay
+  is set to the value of `fetcher.server.delay` which guarantees that
+  a value set in the robots.txt cannot make the crawler more aggressive
+  than the default configuration.
+  </description>
+</property>
+
+<property>
   <name>fetcher.threads.fetch</name>
   <value>10</value>
   <description>The number of FetcherThreads the fetcher should use.

diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 5d5a20b..549cd36 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -85,6 +85,7 @@ public class FetcherThread extends Thread {
   private URLNormalizers normalizers;
   private ProtocolFactory protocolFactory;
   private long maxCrawlDelay;
+  private long minCrawlDelay;
   private String queueMode;
   private int maxRedirect;
   private boolean maxRedirectExceededSkip = false;
@@ -165,6 +166,9 @@ public class FetcherThread extends Thread {
     this.protocolFactory = new ProtocolFactory(conf);
     this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
     this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
+    float crawlDelay = conf.getFloat("fetcher.server.delay", 1.0f);
+    this.minCrawlDelay = (long) (conf.getFloat("fetcher.min.crawl.delay",
+        crawlDelay) * 1000);
     this.activeThreads = activeThreads;
     this.fetchQueues = fetchQueues;
     this.feeder = feeder;
@@ -324,8 +328,8 @@
           if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
             // unblock
             fetchQueues.finishFetchItem(fit, true);
-            LOG.info("Crawl-Delay for {} too long ({}), skipping", fit.url,
-                rules.getCrawlDelay());
+            LOG.info("Crawl-Delay for {} too long ({} ms), skipping",
+                fit.url, rules.getCrawlDelay());
             output(fit.url, fit.datum, null,
                 ProtocolStatus.STATUS_ROBOTS_DENIED,
                 CrawlDatum.STATUS_FETCH_GONE);
@@ -334,7 +338,14 @@
             continue;
           } else {
             FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
-            fiq.crawlDelay = rules.getCrawlDelay();
+            long crawlDelay = rules.getCrawlDelay();
+            if (crawlDelay < minCrawlDelay) {
+              LOG.info(
+                  "Crawl-Delay for {} too short ({} ms), adjusting to {} ms",
+                  fit.url, rules.getCrawlDelay(), minCrawlDelay);
+              crawlDelay = minCrawlDelay;
+            }
+            fiq.crawlDelay = crawlDelay;
             if (LOG.isDebugEnabled()) {
               LOG.debug("Crawl delay for queue: " + fit.queueID
                   + " is set to " + fiq.crawlDelay
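
Note (illustration, not part of the commit): operators who want a stricter floor than the per-server default can override the new property in conf/nutch-site.xml. The snippet below is a minimal sketch; the 5.0-second value is an assumed example chosen only to show the units (seconds, matching the description in nutch-default.xml).

<!-- hypothetical override in conf/nutch-site.xml: enforce at least a
     5-second delay per fetch queue, even if robots.txt declares a
     shorter Crawl-delay (value in seconds, example only) -->
<property>
  <name>fetcher.min.crawl.delay</name>
  <value>5.0</value>
</property>

With such an override, a host whose robots.txt declares e.g. "Crawl-delay: 1" would be fetched with a 5000 ms delay (the "too short ... adjusting" log message added above), while a Crawl-delay longer than fetcher.max.crawl.delay still causes the URL to be skipped as before.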