Hey,
Here's a patch that'll allow users to configure how many threads they want to access the same host at the same time. Right now Nutch only allows one thread at a time to access any given host. The default will still be 1 thread per host.
The somewhat fuzzy part of this is that it will apply the fetcher.server.delay wait only when the last thread accessing a host finishes and is popped off. With 1 thread per host this results in behavior identical to the current behavior.
Let me know what you think. I've tested it a little and it seems to work as it is supposed to.
Thanks,
Luke Baker
diff -Nur nutch/conf/nutch-default.xml nutch-changed/conf/nutch-default.xml
--- nutch/conf/nutch-default.xml 2004-12-01 13:26:20.000000000 -0500
+++ nutch-changed/conf/nutch-default.xml 2004-12-01 13:27:53.561158643 -0500
@@ -275,6 +275,13 @@
</property>
<property>
+ <name>threads.per.host</name>
+ <value>1</value>
+ <description>This number is the maximum number of threads that
+ should be allowed to access a host at one time.</description>
+</property>
+
+<property>
<name>fetcher.retry.max</name>
<value>3</value>
<description>The maximum number of times the fetcher will attempt to get
diff -Nur nutch/src/plugin/protocol-http/src/java/net/nutch/protocol/http/Http.java nutch-changed/src/plugin/protocol-http/src/java/net/nutch/protocol/http/Http.java
--- nutch/src/plugin/protocol-http/src/java/net/nutch/protocol/http/Http.java 2004-06-28 17:26:39.000000000 -0400
+++ nutch-changed/src/plugin/protocol-http/src/java/net/nutch/protocol/http/Http.java 2004-12-01 13:29:22.721751381 -0500
@@ -41,6 +41,7 @@
static int MAX_CONTENT= NutchConf.getInt("http.content.limit",64*1024);
static int MAX_DELAYS= NutchConf.getInt("http.max.delays",3);
+ static int MAX_THREADS_PER_HOST = NutchConf.getInt("threads.per.host", 1);
static String AGENT_STRING = getAgentString();
@@ -64,6 +65,9 @@
* a request finishes. This way only one thread at a time accesses an
* address. */
private static HashMap BLOCKED_ADDR_TO_TIME = new HashMap();
+
+ /** Maps an address to the number of threads that are accessing that address. */
+ private static HashMap THREADS_PER_HOST_COUNT = new HashMap();
/** Queue of blocked InetAddress. This contains all of the non-zero entries
* from BLOCKED_ADDR_TO_TIME, ordered by increasing time. */
@@ -84,11 +88,25 @@
cleanExpiredServerBlocks(); // free held addresses
Long time;
+ Integer addrCount;
+
synchronized (BLOCKED_ADDR_TO_TIME) {
time = (Long) BLOCKED_ADDR_TO_TIME.get(addr);
if (time == null) { // address is free
- BLOCKED_ADDR_TO_TIME.put(addr, new Long(0)); // block it
- return addr;
+ synchronized (THREADS_PER_HOST_COUNT) {
+ addrCount = (Integer)THREADS_PER_HOST_COUNT.get(addr); // get # of threads currently accessing this addr
+ if (addrCount == null || addrCount.intValue() == 0) { // no threads accessing yet
+ THREADS_PER_HOST_COUNT.put(addr, new Integer(1));
+ }
+ else {
+ THREADS_PER_HOST_COUNT.put(addr, new Integer(addrCount.intValue() + 1));
+ }
+ addrCount = (Integer)THREADS_PER_HOST_COUNT.get(addr);
+ if (addrCount.intValue() >= MAX_THREADS_PER_HOST) {
+ BLOCKED_ADDR_TO_TIME.put(addr, new Long(0)); // block it
+ }
+ return addr;
+ }
}
}
@@ -129,9 +147,18 @@
private static void unblockAddr(InetAddress addr) {
synchronized (BLOCKED_ADDR_TO_TIME) {
- BLOCKED_ADDR_QUEUE.addFirst(addr);
- BLOCKED_ADDR_TO_TIME.put
- (addr, new Long(System.currentTimeMillis()+SERVER_DELAY));
+ synchronized (THREADS_PER_HOST_COUNT) {
+ int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(addr)).intValue();
+ if (addrCount == 1) {
+ THREADS_PER_HOST_COUNT.remove(addr);
+ BLOCKED_ADDR_QUEUE.addFirst(addr);
+ BLOCKED_ADDR_TO_TIME.put
+ (addr, new Long(System.currentTimeMillis()+SERVER_DELAY));
+ }
+ else {
+ THREADS_PER_HOST_COUNT.put(addr, new Integer(addrCount - 1));
+ }
+ }
}
}
