Author: ab
Date: Thu Sep 28 03:48:25 2006
New Revision: 450799

URL: http://svn.apache.org/viewvc?view=rev&rev=450799
Log:
Bring back the '-noAdditions' option. This is useful for running
constrained crawls, where the complete list of URLs is known in advance.
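As a usage sketch (not part of this commit): assuming the stock bin/nutch
wrapper, which dispatches the updatedb command to this class, and hypothetical
crawldb/segment paths, a constrained update looks like:

    bin/nutch updatedb crawl/crawldb crawl/segments/20060928030000 -noAdditions

With -noAdditions, only URLs already present in the CrawlDb are refreshed from
the segment; newly discovered outlinks are not added.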
Modified:
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=450799&r1=450798&r2=450799
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Sep 28 03:48:25 2006
@@ -237,6 +237,15 @@
 </property>
 
 <property>
+  <name>db.update.additions.allowed</name>
+  <value>true</value>
+  <description>If true, updatedb will add newly discovered URLs; if false,
+  only URLs already present in the CrawlDb will be updated and no new
+  URLs will be added.
+  </description>
+</property>
+
+<property>
   <name>db.ignore.internal.links</name>
   <value>true</value>
   <description>If true, when adding new links to a page, links from
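Since the property above defaults to true, additions can also be disabled
site-wide instead of per run. A minimal sketch of an override, assuming the
usual Nutch convention that values in conf/nutch-site.xml take precedence
over nutch-default.xml:

    <property>
      <name>db.update.additions.allowed</name>
      <value>false</value>
      <description>Site-wide equivalent of passing -noAdditions to updatedb.
      </description>
    </property>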
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?view=diff&rev=450799&r1=450798&r2=450799
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Thu Sep 28 03:48:25 2006
@@ -38,6 +38,7 @@
  * crawldb accordingly.
  */
 public class CrawlDb extends ToolBase {
+  public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
   public static final Log LOG = LogFactory.getLog(CrawlDb.class);
 
@@ -50,16 +51,23 @@
   }
 
   public void update(Path crawlDb, Path segment, boolean normalize, boolean filter) throws IOException {
+    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
+    update(crawlDb, segment, normalize, filter, additionsAllowed);
+  }
+
+  public void update(Path crawlDb, Path segment, boolean normalize, boolean filter, boolean additionsAllowed) throws IOException {
     if (LOG.isInfoEnabled()) {
       LOG.info("CrawlDb update: starting");
       LOG.info("CrawlDb update: db: " + crawlDb);
       LOG.info("CrawlDb update: segment: " + segment);
+      LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
       LOG.info("CrawlDb update: URL normalizing: " + normalize);
       LOG.info("CrawlDb update: URL filtering: " + filter);
     }
 
     JobConf job = CrawlDb.createJob(getConf(), crawlDb);
+    job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
     job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
     job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
     job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
@@ -122,26 +130,30 @@
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
-      System.err.println("Usage: CrawlDb <crawldb> <segment> [-normalize] [-filter]");
+      System.err.println("Usage: CrawlDb <crawldb> <segment> [-normalize] [-filter] [-noAdditions]");
       System.err.println("\tcrawldb\tCrawlDb to update");
       System.err.println("\tsegment\tsegment name to update from");
       System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
       System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
+      System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
       return -1;
     }
     boolean normalize = false;
     boolean filter = false;
+    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
     if (args.length > 2) {
       for (int i = 2; i < args.length; i++) {
         if (args[i].equals("-normalize")) {
           normalize = true;
         } else if (args[i].equals("-filter")) {
           filter = true;
+        } else if (args[i].equals("-noAdditions")) {
+          additionsAllowed = false;
         }
       }
     }
     try {
-      update(new Path(args[0]), new Path(args[1]), normalize, filter);
+      update(new Path(args[0]), new Path(args[1]), normalize, filter, additionsAllowed);
       return 0;
     } catch (Exception e) {
       LOG.fatal("CrawlDb update: " + StringUtils.stringifyException(e));

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?view=diff&rev=450799&r1=450798&r2=450799
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Thu Sep 28 03:48:25 2006
@@ -36,10 +36,12 @@
   private CrawlDatum result = new CrawlDatum();
   private ArrayList linked = new ArrayList();
   private ScoringFilters scfilters = null;
+  private boolean additionsAllowed;
 
   public void configure(JobConf job) {
     retryMax = job.getInt("db.fetch.retry.max", 3);
     scfilters = new ScoringFilters(job);
+    additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
   }
 
   public void close() {}
@@ -74,6 +76,9 @@
       }
     }
 
+    // if it doesn't already exist, skip it
+    if (old == null && !additionsAllowed) return;
+
     // initialize with the latest version
     result.set(highest);
     if (old != null) {
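For callers embedding Nutch, a minimal sketch of driving the new five-argument
update() overload directly; the driver class and paths are hypothetical, and
the NutchConfiguration/ToolBase wiring is assumed from this era of the code base:

    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.crawl.CrawlDb;
    import org.apache.nutch.util.NutchConfiguration;

    public class NoAdditionsUpdate {               // hypothetical driver class
      public static void main(String[] args) throws Exception {
        CrawlDb crawlDb = new CrawlDb();
        crawlDb.setConf(NutchConfiguration.create());  // ToolBase configuration
        // additionsAllowed = false is the programmatic equivalent of
        // -noAdditions: existing CrawlDb entries are refreshed, newly
        // discovered URLs are skipped in the reducer.
        crawlDb.update(new Path("crawl/crawldb"),                 // CrawlDb to update
                       new Path("crawl/segments/20060928030000"), // fetched segment
                       false,   // normalize
                       false,   // filter
                       false);  // additionsAllowed
      }
    }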