Author: cutting
Date: Thu Apr 20 12:18:56 2006
New Revision: 395676

URL: http://svn.apache.org/viewcvs?rev=395676&view=rev
Log:
Fix NUTCH-108. Log hosts that exceed generate.max.per.host. Contributed by Rod Taylor.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/CHANGES.txt?rev=395676&r1=395675&r2=395676&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Apr 20 12:18:56 2006 @@ -4,6 +4,10 @@ 1. NUTCH-107 - Typo in plugin/urlfilter-*/plugin.xml. (Stephen Cross). + 2. NUTCH-108 - Log hosts that exceed generate.max.per.host. + (Rod Taylor via cutting) + + Release 0.7 - 2005-08-17 1. Added support for "type:" in queries. Search results are limited/qualified Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=395676&r1=395675&r2=395676&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Apr 20 12:18:56 2006 @@ -127,12 +127,18 @@ if (maxPerHost > 0) { // are we counting hosts? String host = new URL(url.toString()).getHost(); Integer hostCount = (Integer)hostCounts.get(host); - if (hostCount != null) { - if (hostCount.intValue() >= maxPerHost) - continue; // too many from host - hostCounts.put(host, new Integer(hostCount.intValue()+1)); - } else { // update host count - hostCounts.put(host, new Integer(1)); + + // increment hostCount + hostCount = new Integer(hostCount==null ? 1 : hostCount.intValue()+1); + hostCounts.put(host, hostCount); + + // skip URL if above the limit per host. + if (hostCount.intValue() > maxPerHost) { + if (hostCount.intValue() == maxPerHost + 1) { + LOG.info("Host "+ host +" has more than "+ maxPerHost +" URLs."+ + " Skipping additional."); + } + continue; } }