Author: ab
Date: Mon Apr 3 06:35:34 2006
New Revision: 391044
URL: http://svn.apache.org/viewcvs?rev=391044&view=rev
Log:
Make sure we use new values for score, metadata, fetch interval and fetch time.
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=391044&r1=391043&r2=391044&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon Apr 3 06:35:34 2006 @@ -25,6 +25,7 @@ /** Merge new page entries with existing entries. */ public class CrawlDbReducer implements Reducer { private int retryMax; + private CrawlDatum result = new CrawlDatum(); public void configure(JobConf job) { retryMax = job.getInt("db.fetch.retry.max", 3); @@ -61,36 +62,45 @@ } } - CrawlDatum result = null; + // initialize with the latest version + result.set(highest); + if (old != null) { + // copy metadata from old, if exists + if (old.getMetaData() != null) { + result.getMetaData().putAll(old.getMetaData()); + // overlay with new, if any + if (highest.getMetaData() != null) + result.getMetaData().putAll(highest.getMetaData()); + } + // set the most recent valid value of modifiedTime + if (old.getModifiedTime() > 0 && highest.getModifiedTime() == 0) { + result.setModifiedTime(old.getModifiedTime()); + } + } switch (highest.getStatus()) { // determine new status case CrawlDatum.STATUS_DB_UNFETCHED: // no new entry case CrawlDatum.STATUS_DB_FETCHED: case CrawlDatum.STATUS_DB_GONE: - result = old; // use old + result.set(old); // use old break; case CrawlDatum.STATUS_LINKED: // highest was link if (old != null) { // if old exists - result = old; // use it + result.set(old); // use it } else { - result = highest; // use new entry result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); - result.setScore(1.0f); // initial score is 1.0f } - result.setSignature(null); // reset the signature break; 
case CrawlDatum.STATUS_FETCH_SUCCESS: // succesful fetch - result = highest; // use new entry - if (highest.getSignature() == null) highest.setSignature(signature); + if (highest.getSignature() == null) result.setSignature(signature); result.setStatus(CrawlDatum.STATUS_DB_FETCHED); result.setNextFetchTime(); break; case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure - result = highest; // use new entry if (old != null) result.setSignature(old.getSignature()); // use old signature if (highest.getRetriesSinceFetch() < retryMax) { @@ -101,7 +111,6 @@ break; case CrawlDatum.STATUS_FETCH_GONE: // permanent failure - result = highest; // use new entry if (old != null) result.setSignature(old.getSignature()); // use old signature result.setStatus(CrawlDatum.STATUS_DB_GONE); @@ -111,10 +120,8 @@ throw new RuntimeException("Unknown status: "+highest.getStatus()); } - if (result != null) { - result.setScore(result.getScore() + scoreIncrement); - output.collect(key, result); - } + result.setScore(result.getScore() + scoreIncrement); + output.collect(key, result); } }