Author: ab
Date: Thu Sep 28 03:48:25 2006
New Revision: 450799

URL: http://svn.apache.org/viewvc?view=rev&rev=450799
Log:
Bring back the '-noAdditions' option. This is useful for running
constrained crawls, where the complete list of URLs is known in advance.
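As a usage sketch (not part of this commit): assuming the stock bin/nutch
wrapper, which dispatches the updatedb command to this class, and hypothetical
crawldb/segment paths, a constrained update looks like:

    bin/nutch updatedb crawl/crawldb crawl/segments/20060928030000 -noAdditions

With -noAdditions, only URLs already present in the CrawlDb are refreshed from
the segment; newly discovered outlinks are not added.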
Modified:
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=450799&r1=450798&r2=450799
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Sep 28 03:48:25 2006
@@ -237,6 +237,15 @@
 </property>
 
 <property>
+  <name>db.update.additions.allowed</name>
+  <value>true</value>
+  <description>If true, updatedb will add newly discovered URLs; if false,
+  only URLs already present in the CrawlDb will be updated and no new
+  URLs will be added.
+  </description>
+</property>
+
+<property>
   <name>db.ignore.internal.links</name>
   <value>true</value>
   <description>If true, when adding new links to a page, links from
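Since the property above defaults to true, additions can also be disabled
site-wide instead of per run. A minimal sketch of an override, assuming the
usual Nutch convention that values in conf/nutch-site.xml take precedence
over nutch-default.xml:

    <property>
      <name>db.update.additions.allowed</name>
      <value>false</value>
      <description>Site-wide equivalent of passing -noAdditions to updatedb.
      </description>
    </property>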
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?view=diff&rev=450799&r1=450798&r2=450799
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Thu Sep 28 03:48:25 2006
@@ -38,6 +38,7 @@
  * crawldb accordingly.
  */
 public class CrawlDb extends ToolBase {
+  public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
   public static final Log LOG = LogFactory.getLog(CrawlDb.class);
 
@@ -50,16 +51,23 @@
   }
 
   public void update(Path crawlDb, Path segment, boolean normalize, boolean filter) throws IOException {
+    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
+    update(crawlDb, segment, normalize, filter, additionsAllowed);
+  }
+
+  public void update(Path crawlDb, Path segment, boolean normalize, boolean filter, boolean additionsAllowed) throws IOException {
     if (LOG.isInfoEnabled()) {
       LOG.info("CrawlDb update: starting");
       LOG.info("CrawlDb update: db: " + crawlDb);
       LOG.info("CrawlDb update: segment: " + segment);
+      LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
       LOG.info("CrawlDb update: URL normalizing: " + normalize);
       LOG.info("CrawlDb update: URL filtering: " + filter);
     }
 
     JobConf job = CrawlDb.createJob(getConf(), crawlDb);
+    job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
     job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
     job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
     job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
@@ -122,26 +130,30 @@
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
-      System.err.println("Usage: CrawlDb <crawldb> <segment> [-normalize] [-filter]");
+      System.err.println("Usage: CrawlDb <crawldb> <segment> [-normalize] [-filter] [-noAdditions]");
       System.err.println("\tcrawldb\tCrawlDb to update");
       System.err.println("\tsegment\tsegment name to update from");
       System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
       System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
+      System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
       return -1;
     }
     boolean normalize = false;
     boolean filter = false;
+    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
     if (args.length > 2) {
       for (int i = 2; i < args.length; i++) {
         if (args[i].equals("-normalize")) {
           normalize = true;
         } else if (args[i].equals("-filter")) {
           filter = true;
+        } else if (args[i].equals("-noAdditions")) {
+          additionsAllowed = false;
         }
       }
     }
     try {
-      update(new Path(args[0]), new Path(args[1]), normalize, filter);
+      update(new Path(args[0]), new Path(args[1]), normalize, filter, additionsAllowed);
       return 0;
     } catch (Exception e) {
       LOG.fatal("CrawlDb update: " + StringUtils.stringifyException(e));

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?view=diff&rev=450799&r1=450798&r2=450799
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Thu Sep 28 03:48:25 2006
@@ -36,10 +36,12 @@
   private CrawlDatum result = new CrawlDatum();
   private ArrayList linked = new ArrayList();
   private ScoringFilters scfilters = null;
+  private boolean additionsAllowed;
 
   public void configure(JobConf job) {
     retryMax = job.getInt("db.fetch.retry.max", 3);
     scfilters = new ScoringFilters(job);
+    additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
   }
 
   public void close() {}
@@ -74,6 +76,9 @@
       }
     }
 
+    // if it doesn't already exist, skip it
+    if (old == null && !additionsAllowed) return;
+
     // initialize with the latest version
     result.set(highest);
     if (old != null) {
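For callers embedding Nutch, a minimal sketch of driving the new five-argument
update() overload directly; the driver class and paths are hypothetical, and
the NutchConfiguration/ToolBase wiring is assumed from this era of the code base:

    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.crawl.CrawlDb;
    import org.apache.nutch.util.NutchConfiguration;

    public class NoAdditionsUpdate {               // hypothetical driver class
      public static void main(String[] args) throws Exception {
        CrawlDb crawlDb = new CrawlDb();
        crawlDb.setConf(NutchConfiguration.create());  // ToolBase configuration
        // additionsAllowed = false is the programmatic equivalent of
        // -noAdditions: existing CrawlDb entries are refreshed, newly
        // discovered URLs are skipped in the reducer.
        crawlDb.update(new Path("crawl/crawldb"),                 // CrawlDb to update
                       new Path("crawl/segments/20060928030000"), // fetched segment
                       false,   // normalize
                       false,   // filter
                       false);  // additionsAllowed
      }
    }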