Author: dogacan Date: Sat Jun 16 03:33:24 2007 New Revision: 547901 URL: http://svn.apache.org/viewvc?view=rev&rev=547901 Log: NUTCH-495 - Unnecessary delays in Fetcher2.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=547901&r1=547900&r2=547901 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Jun 16 03:33:24 2007 @@ -30,7 +30,7 @@ 10. NUTCH-392 - OutputFormat implementations should pass on Progressable. (cutting via ab) - +11. NUTCH-495 - Unnecessary delays in Fetcher2 (dogacan) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=547901&r1=547900&r2=547901 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Sat Jun 16 03:33:24 2007 @@ -195,7 +195,7 @@ private static class FetchItemQueue { List<FetchItem> queue = Collections.synchronizedList(new LinkedList<FetchItem>()); Set<FetchItem> inProgress = Collections.synchronizedSet(new HashSet<FetchItem>()); - AtomicLong endTime = new AtomicLong(); + AtomicLong nextFetchTime = new AtomicLong(); long crawlDelay; long minCrawlDelay; int maxThreads; @@ -207,7 +207,7 @@ this.crawlDelay = crawlDelay; this.minCrawlDelay = minCrawlDelay; // ready to start - this.endTime.set(System.currentTimeMillis() - crawlDelay); + setEndTime(System.currentTimeMillis() - crawlDelay); } public int getQueueSize() { @@ -218,10 +218,10 @@ return inProgress.size(); } - public void finishFetchItem(FetchItem it) { + public void finishFetchItem(FetchItem it, boolean asap) { if (it != null) { inProgress.remove(it); - endTime.set(System.currentTimeMillis()); + setEndTime(System.currentTimeMillis(), asap); } } @@ -238,8 +238,7 @@ public FetchItem getFetchItem() { if (inProgress.size() >= maxThreads) return null; long now = System.currentTimeMillis(); - long last = endTime.get() + (maxThreads > 1 ? minCrawlDelay : crawlDelay); - if (last > now) return null; + if (nextFetchTime.get() > now) return null; FetchItem it = null; if (queue.size() == 0) return null; try { @@ -256,13 +255,24 @@ LOG.info(" inProgress = " + inProgress.size()); LOG.info(" crawlDelay = " + crawlDelay); LOG.info(" minCrawlDelay = " + minCrawlDelay); - LOG.info(" endTime = " + endTime.get()); + LOG.info(" nextFetchTime = " + nextFetchTime.get()); LOG.info(" now = " + System.currentTimeMillis()); for (int i = 0; i < queue.size(); i++) { FetchItem it = queue.get(i); LOG.info(" " + i + ". " + it.url); } } + + private void setEndTime(long endTime) { + setEndTime(endTime, false); + } + + private void setEndTime(long endTime, boolean asap) { + if (!asap) + nextFetchTime.set(endTime + (maxThreads > 1 ? minCrawlDelay : crawlDelay)); + else + nextFetchTime.set(endTime); + } } /** @@ -308,12 +318,16 @@ } public void finishFetchItem(FetchItem it) { + finishFetchItem(it, false); + } + + public void finishFetchItem(FetchItem it, boolean asap) { FetchItemQueue fiq = queues.get(it.queueID); if (fiq == null) { LOG.warn("Attempting to finish item from unknown queue: " + it); return; } - fiq.finishFetchItem(it); + fiq.finishFetchItem(it, asap); } public synchronized FetchItemQueue getFetchItemQueue(String id) { @@ -474,7 +488,7 @@ RobotRules rules = protocol.getRobotRules(fit.url, fit.datum); if (!rules.isAllowed(fit.u)) { // unblock - fetchQueues.finishFetchItem(fit); + fetchQueues.finishFetchItem(fit, true); if (LOG.isDebugEnabled()) { LOG.debug("Denied by robots.txt: " + fit.url); } @@ -484,7 +498,7 @@ if (rules.getCrawlDelay() > 0) { if (rules.getCrawlDelay() > maxCrawlDelay) { // unblock - fetchQueues.finishFetchItem(fit); + fetchQueues.finishFetchItem(fit, true); LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping"); output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); continue; @@ -503,8 +517,6 @@ switch(status.getCode()) { case ProtocolStatus.WOULDBLOCK: - // unblock - fetchQueues.finishFetchItem(fit); // retry ? fetchQueues.addFetchItem(fit); break; ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs