Hey,

Here's a patch that'll allow users to configure how many threads may access the same host at the same time. Right now Nutch only allows one thread at a time to access any given host. The default will still be 1 thread per host.

The somewhat fuzzy part of this is that it will wait for fetcher.server.delay only when the last thread accessing a host finishes. With 1 thread per host this results in behavior identical to the current behavior.

Let me know what you think. I've tested it a little and it seems to work as it is supposed to.

Thanks,

Luke Baker
diff -Nur nutch/conf/nutch-default.xml nutch-changed/conf/nutch-default.xml
--- nutch/conf/nutch-default.xml	2004-12-01 13:26:20.000000000 -0500
+++ nutch-changed/conf/nutch-default.xml	2004-12-01 13:27:53.561158643 -0500
@@ -275,6 +275,13 @@
 </property>
 
 <property>
+  <name>threads.per.host</name>
+  <value>1</value>
+  <description>This number is the maximum number of threads that
+    should be allowed to access a host at one time.</description>
+</property>
+
+<property>
   <name>fetcher.retry.max</name>
   <value>3</value>
   <description>The maximum number of times the fetcher will attempt to get 
diff -Nur nutch/src/plugin/protocol-http/src/java/net/nutch/protocol/http/Http.java nutch-changed/src/plugin/protocol-http/src/java/net/nutch/protocol/http/Http.java
--- nutch/src/plugin/protocol-http/src/java/net/nutch/protocol/http/Http.java	2004-06-28 17:26:39.000000000 -0400
+++ nutch-changed/src/plugin/protocol-http/src/java/net/nutch/protocol/http/Http.java	2004-12-01 13:29:22.721751381 -0500
@@ -41,6 +41,7 @@
   static int MAX_CONTENT= NutchConf.getInt("http.content.limit",64*1024);
 
   static int MAX_DELAYS= NutchConf.getInt("http.max.delays",3);
+  static int MAX_THREADS_PER_HOST = NutchConf.getInt("threads.per.host", 1);
 
   static String AGENT_STRING = getAgentString();
 
@@ -64,6 +65,9 @@
    * a request finishes.  This way only one thread at a time accesses an
    * address. */
   private static HashMap BLOCKED_ADDR_TO_TIME = new HashMap();
+    
+  /** Maps an address to the number of threads that are accessing that address. */
+  private static HashMap THREADS_PER_HOST_COUNT = new HashMap();
 
   /** Queue of blocked InetAddress.  This contains all of the non-zero entries
    * from BLOCKED_ADDR_TO_TIME, ordered by increasing time. */
@@ -84,11 +88,25 @@
       cleanExpiredServerBlocks();                 // free held addresses
 
       Long time;
+      Integer addrCount;
+      
       synchronized (BLOCKED_ADDR_TO_TIME) {
         time = (Long) BLOCKED_ADDR_TO_TIME.get(addr);
         if (time == null) {                       // address is free
-          BLOCKED_ADDR_TO_TIME.put(addr, new Long(0)); // block it
-          return addr;
+          synchronized (THREADS_PER_HOST_COUNT) {
+            addrCount = (Integer)THREADS_PER_HOST_COUNT.get(addr);  // get # of threads currently accessing this addr
+            if (addrCount == null || addrCount.intValue() == 0) { // no threads accessing yet
+              THREADS_PER_HOST_COUNT.put(addr, new Integer(1));
+            }
+            else {
+              THREADS_PER_HOST_COUNT.put(addr, new Integer(addrCount.intValue() + 1));
+            }
+            addrCount = (Integer)THREADS_PER_HOST_COUNT.get(addr);
+            if (addrCount.intValue() >= MAX_THREADS_PER_HOST) {
+              BLOCKED_ADDR_TO_TIME.put(addr, new Long(0)); // block it
+            }
+            return addr;
+          }
         }
       }
 
@@ -129,9 +147,18 @@
 
   private static void unblockAddr(InetAddress addr) {
     synchronized (BLOCKED_ADDR_TO_TIME) {
-      BLOCKED_ADDR_QUEUE.addFirst(addr);
-      BLOCKED_ADDR_TO_TIME.put
-        (addr, new Long(System.currentTimeMillis()+SERVER_DELAY));
+      synchronized (THREADS_PER_HOST_COUNT) {
+        int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(addr)).intValue();
+        if (addrCount == 1) {
+          THREADS_PER_HOST_COUNT.remove(addr);
+          BLOCKED_ADDR_QUEUE.addFirst(addr);
+          BLOCKED_ADDR_TO_TIME.put
+            (addr, new Long(System.currentTimeMillis()+SERVER_DELAY));
+        }
+        else {
+          THREADS_PER_HOST_COUNT.put(addr, new Integer(addrCount - 1));
+        }
+      }
     }
   }
 

Reply via email to