Author: ab Date: Mon Mar 17 05:42:54 2008 New Revision: 637861 URL: http://svn.apache.org/viewvc?rev=637861&view=rev Log: NUTCH-616 Reset Fetch Retry counter when fetch is successful.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637861&r1=637860&r2=637861&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 17 05:42:54 2008 @@ -232,6 +232,9 @@ 84. NUTCH-615 - Redirected URL-s fetched without setting fetchInterval. Guard against reprUrl being null. (Emmanuel Joke, ab) +85. NUTCH-616 - Reset Fetch Retry counter when fetch is successful (Emmanuel + Joke, ab) + Release 0.9 - 2007-04-02 1. 
Changed log4j configuration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Mon Mar 17 05:42:54 2008 @@ -33,8 +33,8 @@ public abstract class AbstractFetchSchedule extends Configured implements FetchSchedule { private static final Log LOG = LogFactory.getLog(AbstractFetchSchedule.class); - private int defaultInterval; - private int maxInterval; + protected int defaultInterval; + protected int maxInterval; public AbstractFetchSchedule() { super(null); @@ -69,12 +69,22 @@ public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) { datum.setFetchTime(System.currentTimeMillis()); datum.setFetchInterval(defaultInterval); + datum.setRetriesSinceFetch(0); return datum; } - public abstract CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, + /** + * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a + * successfully fetched page. NOTE: this implementation resets the + * retry counter - extending classes should call super.setFetchSchedule() to + * preserve this behavior. + */ + public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, - long fetchTime, long modifiedTime, int state); + long fetchTime, long modifiedTime, int state) { + datum.setRetriesSinceFetch(0); + return datum; + } /** * This method specifies how to schedule refetching of pages @@ -101,7 +111,8 @@ /** * This method adjusts the fetch schedule if fetching needs to be * re-tried due to transient errors. 
The default implementation - * sets the next fetch time 1 day in the future. + * sets the next fetch time 1 day in the future and increases + * the retry counter. * @param url URL of the page * @param datum page information * @param prevFetchTime previous fetch time @@ -115,6 +126,7 @@ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime) { datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY); + datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1); return datum; } @@ -122,7 +134,7 @@ * This method returns the last fetch time of the CrawlDatum * @return the date as a long. */ - public long calculateLastFetchTime(CrawlDatum datum){ + public long calculateLastFetchTime(CrawlDatum datum) { return datum.getFetchTime() - (long)datum.getFetchInterval() * 1000; } @@ -157,8 +169,8 @@ } /** - * This method resets fetchTime, fetchInterval, modifiedTime and - * page signature, so that it forces refetching. + * This method resets fetchTime, fetchInterval, modifiedTime, + * retriesSinceFetch and page signature, so that it forces refetching. 
* @param url URL of the page * @param datum datum instance * @param asap if true, force refetch as soon as possible - this sets @@ -170,6 +182,7 @@ if (datum.getFetchInterval() > maxInterval) datum.setFetchInterval(maxInterval * 0.9f); datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); + datum.setRetriesSinceFetch(0); datum.setSignature(null); datum.setModifiedTime(0L); if (asap) datum.setFetchTime(System.currentTimeMillis()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Mon Mar 17 05:42:54 2008 @@ -76,9 +76,12 @@ SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f); } + @Override public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime, long modifiedTime, int state) { + super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime, + fetchTime, modifiedTime, state); long refTime = fetchTime; if (modifiedTime <= 0) modifiedTime = fetchTime; float interval = datum.getFetchInterval(); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=637861&r1=637860&r2=637861&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon Mar 17 05:42:54 2008 @@ -210,15 +210,16 @@ } return; case 
CrawlDatum.STATUS_FETCH_RETRY: // temporary failure - if (old != null) + if (old != null) { result.setSignature(old.getSignature()); // use old signature - if (fetch.getRetriesSinceFetch() < retryMax) { + } + result = schedule.setPageRetrySchedule((Text)key, result, prevFetchTime, + prevModifiedTime, fetch.getFetchTime()); + if (result.getRetriesSinceFetch() < retryMax) { result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); } else { result.setStatus(CrawlDatum.STATUS_DB_GONE); } - result = schedule.setPageRetrySchedule((Text)key, result, prevFetchTime, - prevModifiedTime, fetch.getFetchTime()); break; case CrawlDatum.STATUS_FETCH_GONE: // permanent failure Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java Mon Mar 17 05:42:54 2008 @@ -29,9 +29,12 @@ */ public class DefaultFetchSchedule extends AbstractFetchSchedule { + @Override public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime, long modifiedTime, int state) { + datum = super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime, + fetchTime, modifiedTime, state); datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000); datum.setModifiedTime(modifiedTime); return datum; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff ============================================================================== --- 
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java Mon Mar 17 05:42:54 2008 @@ -52,7 +52,8 @@ public CrawlDatum initializeSchedule(Text url, CrawlDatum datum); /** - * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a page. + * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a + * successfully fetched page. * Implementations may use supplied arguments to support different re-fetching * schedules. * @@ -97,7 +98,8 @@ /** * This method adjusts the fetch schedule if fetching needs to be * re-tried due to transient errors. The default implementation - * sets the next fetch time 1 day in the future. + * sets the next fetch time 1 day in the future and increases the + * retry counter. * @param url URL of the page * @param datum page information * @param prevFetchTime previous fetch time Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=637861&r1=637860&r2=637861&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Mar 17 05:42:54 2008 @@ -209,9 +209,6 @@ logError(url, status.getMessage()); /* FALLTHROUGH */ case ProtocolStatus.RETRY: // retry - datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1); - /* FALLTHROUGH */ - // intermittent blocking - retry without increasing the counter case ProtocolStatus.WOULDBLOCK: case ProtocolStatus.BLOCKED: output(url, datum, null, status, CrawlDatum.STATUS_FETCH_RETRY); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=637861&r1=637860&r2=637861&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Mon Mar 17 05:42:54 2008 @@ -611,9 +611,6 @@ logError(fit.url, status.getMessage()); /* FALLTHROUGH */ case ProtocolStatus.RETRY: // retry - fit.datum.setRetriesSinceFetch(fit.datum.getRetriesSinceFetch()+1); - /* FALLTHROUGH */ - // intermittent blocking - retry without increasing the counter case ProtocolStatus.BLOCKED: output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY); break;