Re: [Nutch-general] How to limit nutch to fetch, refetch and index just the injected URLs?

Nicolás Lichtmaier Fri, 02 Feb 2007 11:22:10 -0800

Perhaps you should start from reporting which version you are using... The version in trunk/ certainly supports this argument. The version in 0.8.1 does not support it, but it's easy to add.

I've "backported" revision 450799 to the 0.8.x branch to support "-noAdditions". Perhaps you could consider committing it there... (I haven't tested it yet, though).

Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revisión: 492707)
+++ conf/nutch-default.xml	(copia de trabajo)
@@ -237,6 +237,15 @@
 </property>
 
 <property>
+  <name>db.update.additions.allowed</name>
+  <value>true</value>
+  <description>If true, updatedb will add newly discovered URLs, if false
+  only already existing URLs in the CrawlDb will be updated and no new
+  URLs will be added.
+  </description>
+</property>
+
+<property>
   <name>db.ignore.internal.links</name>
   <value>true</value>
   <description>If true, when adding new links to a page, links from
Index: src/java/org/apache/nutch/crawl/CrawlDb.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDb.java	(revisión: 492707)
+++ src/java/org/apache/nutch/crawl/CrawlDb.java	(copia de trabajo)
@@ -36,6 +36,7 @@
  * crawldb accordingly.
  */
 public class CrawlDb extends Configured {
+  public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
 
   public static final Log LOG = LogFactory.getLog(CrawlDb.class);
 
@@ -43,16 +44,22 @@
   public CrawlDb(Configuration conf) {
     super(conf);
   }
+  
+  public void update(Path crawlDb, Path segment) throws IOException {
+    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
+    update(crawlDb, segment, additionsAllowed);
+  }
 
-  public void update(Path crawlDb, Path segment) throws IOException {
-    
+  public void update(Path crawlDb, Path segment, boolean additionsAllowed) throws IOException {    
     if (LOG.isInfoEnabled()) {
       LOG.info("CrawlDb update: starting");
       LOG.info("CrawlDb update: db: " + crawlDb);
       LOG.info("CrawlDb update: segment: " + segment);
+      LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
     }
 
     JobConf job = CrawlDb.createJob(getConf(), crawlDb);
+    job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
     job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
     job.addInputPath(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
 
@@ -108,13 +115,26 @@
   }
 
   public static void main(String[] args) throws Exception {
-    CrawlDb crawlDb = new CrawlDb(NutchConfiguration.create());
+    Configuration c = NutchConfiguration.create();
+    CrawlDb crawlDb = new CrawlDb(c);
     
     if (args.length < 2) {
-      System.err.println("Usage: <crawldb> <segment>");
+      System.err.println("Usage: <crawldb> <segment> [-noAdditions]");
+      System.err.println("\tcrawldb\tCrawlDb to update");
+      System.err.println("\tsegment\tsegment name to update from");
+      System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
       return;
     }
     
-    crawlDb.update(new Path(args[0]), new Path(args[1]));
+    // Start from the configured default; -noAdditions on the command line overrides it.
+    boolean additionsAllowed = c.getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
+    for(int i = 2 ; i < args.length; i++) {
+      if (args[i].equals("-noAdditions")) {
+        additionsAllowed = false;
+      }
+    }
+    
+    // Pass the flag explicitly: the two-argument overload re-reads the config and would ignore -noAdditions.
+    crawlDb.update(new Path(args[0]), new Path(args[1]), additionsAllowed);
   }
 
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(revisión: 492707)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java	(copia de trabajo)
@@ -36,10 +36,12 @@
   private CrawlDatum result = new CrawlDatum();
   private ArrayList linked = new ArrayList();
   private ScoringFilters scfilters = null;
+  private boolean additionsAllowed;
 
   public void configure(JobConf job) {
     retryMax = job.getInt("db.fetch.retry.max", 3);
     scfilters = new ScoringFilters(job);
+    additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
   }
 
   public void close() {}
@@ -74,6 +76,9 @@
       }
     }
 
+    // if it doesn't already exist, skip it
+    if (old == null && !additionsAllowed) return;
+    
     // initialize with the latest version
     result.set(highest);
     if (old != null) {

-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier.
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642

_______________________________________________
Nutch-general mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-general

Re: [Nutch-general] How to limit nutch to fetch, refetch and index just the injected URLs?

Reply via email to