Author: cutting
Date: Thu Apr 20 12:18:56 2006
New Revision: 395676

URL: http://svn.apache.org/viewcvs?rev=395676&view=rev
Log:
Fix NUTCH-108.  Log hosts that exceed generate.max.per.host.  Contributed by 
Rod Taylor.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/CHANGES.txt?rev=395676&r1=395675&r2=395676&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Apr 20 12:18:56 2006
@@ -4,6 +4,10 @@
 
  1. NUTCH-107 - Typo in plugin/urlfilter-*/plugin.xml. (Stephen Cross).
 
+ 2. NUTCH-108 - Log hosts that exceed generate.max.per.host.
+   (Rod Taylor via cutting)
+
+
 Release 0.7 - 2005-08-17
 
  1. Added support for "type:" in queries. Search results are limited/qualified

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=395676&r1=395675&r2=395676&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Apr 20 12:18:56 2006
@@ -127,12 +127,18 @@
         if (maxPerHost > 0) {                     // are we counting hosts?
           String host = new URL(url.toString()).getHost();
           Integer hostCount = (Integer)hostCounts.get(host);
-          if (hostCount != null) {
-            if (hostCount.intValue() >= maxPerHost)
-              continue;                           // too many from host
-            hostCounts.put(host, new Integer(hostCount.intValue()+1));
-          } else {                                // update host count
-            hostCounts.put(host, new Integer(1));
+
+          // increment hostCount
+          hostCount = new Integer(hostCount==null ? 1 : hostCount.intValue()+1);
+          hostCounts.put(host, hostCount);
+
+          // skip URL if above the limit per host.
+          if (hostCount.intValue() > maxPerHost) {
+            if (hostCount.intValue() == maxPerHost + 1) {
+              LOG.info("Host "+ host +" has more than "+ maxPerHost +" URLs."+
+                       " Skipping additional.");
+            }
+            continue;
           }
         }
 


Reply via email to