Author: fenglu
Date: Thu Sep  5 14:40:25 2013
New Revision: 1520332

URL: http://svn.apache.org/r1520332
Log:
NUTCH-1556 enabling updatedb to accept batchId

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/bin/crawl
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1520332&r1=1520331&r2=1520332&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Sep  5 14:40:25 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1556 enabling updatedb to accept batchId (kaveh minooie,Feng)
+
 * NUTCH-1619 Writes Dmoz Description and Title information to db with snippet argument ( Yasin Kılınç via feng)
 
 * NUTCH-1631 Display Document Count Added To Solr Server (Furkan KAMACI via lewismc)

Modified: nutch/branches/2.x/src/bin/crawl
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1520332&r1=1520331&r2=1520332&view=diff
==============================================================================
--- nutch/branches/2.x/src/bin/crawl (original)
+++ nutch/branches/2.x/src/bin/crawl Thu Sep  5 14:40:25 2013
@@ -141,7 +141,7 @@ do
 
   # updatedb with this batch
   echo "CrawlDB update for $CRAWL_ID"
-  $bin/nutch updatedb $commonOptions -crawlId $CRAWL_ID
+  $bin/nutch updatedb $commonOptions -batchId $batchId -crawlId $CRAWL_ID
 
   if [ $? -ne 0 ] 
   then exit $? 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java?rev=1520332&r1=1520331&r2=1520332&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java Thu Sep  5 14:40:25 2013
@@ -23,6 +23,9 @@ import java.util.Map;
 import java.util.Map.Entry;
 
 import org.apache.avro.util.Utf8;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.storage.Mark;
+import org.apache.nutch.util.NutchJob;
 import org.slf4j.Logger;
 import org.apache.hadoop.io.FloatWritable;
 import org.apache.hadoop.util.StringUtils;
@@ -41,6 +44,8 @@ extends GoraMapper<String, WebPage, UrlW
   private ScoringFilters scoringFilters;
 
   private final List<ScoreDatum> scoreData = new ArrayList<ScoreDatum>();
+
+  private Utf8 batchId;
   
   //reuse writables
   private UrlWithScore urlWithScore = new UrlWithScore();
@@ -51,6 +56,14 @@ extends GoraMapper<String, WebPage, UrlW
   public void map(String key, WebPage page, Context context)
   throws IOException, InterruptedException {
 
+    Utf8 mark = Mark.GENERATE_MARK.checkMark(page);
+    if(!NutchJob.shouldProcess(mark,batchId)) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; different batch id (" + mark + ")");
+      }
+      return;
+    }
+
     String url = TableUtil.unreverseUrl(key);
 
     scoreData.clear();
@@ -93,6 +106,7 @@ extends GoraMapper<String, WebPage, UrlW
   public void setup(Context context) {
     scoringFilters = new ScoringFilters(context.getConfiguration());
     pageWritable = new WebPageWritable(context.getConfiguration(), null);
+    batchId = new Utf8(context.getConfiguration().get(Nutch.BATCH_NAME_KEY,Nutch.ALL_BATCH_ID_STR));
   }
 
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java?rev=1520332&r1=1520331&r2=1520332&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java Thu Sep  5 14:40:25 2013
@@ -73,12 +73,17 @@ public class DbUpdaterJob extends NutchT
     
   public Map<String,Object> run(Map<String,Object> args) throws Exception {
     String crawlId = (String)args.get(Nutch.ARG_CRAWL);
+    String batchId = (String)args.get(Nutch.ARG_BATCH);
     numJobs = 1;
     currentJobNum = 0;
     currentJob = new NutchJob(getConf(), "update-table");
     if (crawlId != null) {
       currentJob.getConfiguration().set(Nutch.CRAWL_ID_KEY, crawlId);
     }
+    if (batchId == null) {
+      batchId = Nutch.ALL_BATCH_ID_STR;
+    }
+    getConf().set(Nutch.BATCH_NAME_KEY, batchId);
     //job.setBoolean(ALL, updateAll);
     ScoringFilters scoringFilters = new ScoringFilters(getConf());
     HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
@@ -100,23 +105,46 @@ public class DbUpdaterJob extends NutchT
     return results;
   }
   
-  private int updateTable(String crawlId) throws Exception {
+  private int updateTable(String crawlId,String batchId) throws Exception {
     LOG.info("DbUpdaterJob: starting");
-    run(ToolUtil.toArgMap(Nutch.ARG_CRAWL, crawlId));
+    if (batchId.equals(Nutch.ALL_BATCH_ID_STR)) {
+      LOG.info("DbUpdaterJob: updatinging all");
+    } else {
+      LOG.info("DbUpdaterJob: batchId: " + batchId);
+    }
+    run(ToolUtil.toArgMap(Nutch.ARG_CRAWL, crawlId,
+            Nutch.ARG_BATCH, batchId));
     LOG.info("DbUpdaterJob: done");
     return 0;
   }
 
   public int run(String[] args) throws Exception {
     String crawlId = null;
+    String batchId;
+
+    String usage = "Usage: DbUpdaterJob (<batchId> | -all) [-crawlId <id>] " +
+            "    <batchId>     - crawl identifier returned by Generator, or -all for all \n \t \t    generated batchId-s\n" +
+            "    -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t    (default: storage.crawl.id)\n";
+
     if (args.length == 0) {
-      //
-    } else if (args.length == 2 && "-crawlId".equals(args[0])) {
-      crawlId = args[1];
-    } else {
-      throw new IllegalArgumentException("usage: " + "(-crawlId <id>)");
+      System.err.println(usage);
+      return -1;
+    }
+
+    batchId = args[0];
+    if (!batchId.equals("-all") && batchId.startsWith("-")) {
+      System.err.println(usage);
+      return -1;
+    }
+
+    for (int i = 1; i < args.length; i++) {
+      if ("-crawlId".equals(args[i])) {
+        getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
+      } else {
+        throw new IllegalArgumentException("arg " +args[i]+ " not recognized");
+      }
     }
-    return updateTable(crawlId);
+    return updateTable(crawlId,batchId);
   }
 
   public static void main(String[] args) throws Exception {


Reply via email to