Author: lewismc
Date: Wed Sep 16 04:22:47 2015
New Revision: 1703331

URL: http://svn.apache.org/r1703331
Log:
NUTCH-1679 UpdateDb using batchId, link may override crawled page

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1703331&r1=1703330&r2=1703331&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Sep 16 04:22:47 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.4-SNAPSHOT
 
+* NUTCH-1679 UpdateDb using batchId, link may override crawled page (Tien Nguyen Manh, Koen Smets, Alfonso Nishikawa, Alexander Kingson via lewismc)
+
 * NUTCH-2077 Upgrade to Tika 1.10 (Michael Joyce, lewismc)
 
 * NUTCH-2045 index-basic incorrect assignment of next fetch time (page.getFetchTime()) as page fetch time (lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1703331&r1=1703330&r2=1703331&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java Wed Sep 16 04:22:47 2015
@@ -35,10 +35,12 @@ import org.apache.nutch.storage.Mark;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.TableUtil;
 import org.apache.nutch.util.WebPageWritable;
+import org.apache.nutch.storage.StorageUtils;
+import org.apache.gora.store.DataStore;
 import org.slf4j.Logger;
 
 public class DbUpdateReducer extends
-    GoraReducer<UrlWithScore, NutchWritable, String, WebPage> {
+GoraReducer<UrlWithScore, NutchWritable, String, WebPage> {
 
  public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
 
@@ -51,10 +53,11 @@ public class DbUpdateReducer extends
   private ScoringFilters scoringFilters;
   private List<ScoreDatum> inlinkedScoreData = new ArrayList<ScoreDatum>();
   private int maxLinks;
+  public DataStore<String, WebPage> datastore;
 
   @Override
   protected void setup(Context context) throws IOException,
-      InterruptedException {
+  InterruptedException {
     Configuration conf = context.getConfiguration();
     retryMax = conf.getInt("db.fetch.retry.max", 3);
     additionsAllowed = conf.getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
@@ -62,6 +65,17 @@ public class DbUpdateReducer extends
     schedule = FetchScheduleFactory.getFetchSchedule(conf);
     scoringFilters = new ScoringFilters(conf);
     maxLinks = conf.getInt("db.update.max.inlinks", 10000);
+    try {
+      datastore = StorageUtils.createWebStore(conf, String.class, WebPage.class);
+    }
+    catch (ClassNotFoundException e) {
+      throw new IOException(e);
+    }
+  }
+  
+  @Override
+  protected void cleanup(Context context) throws IOException, InterruptedException {
+    datastore.close();
   }
 
   @Override
@@ -70,6 +84,8 @@ public class DbUpdateReducer extends
     String keyUrl = key.getUrl().toString();
 
     WebPage page = null;
+    //initialize old_page for checking if the outlink is already in the datastore
+    WebPage old_page = null;
     inlinkedScoreData.clear();
 
     for (NutchWritable nutchWritable : values) {
@@ -94,7 +110,12 @@ public class DbUpdateReducer extends
       return;
     }
 
-    if (page == null) { // new row
+    //check if page is already in the db
+    if(page == null && (old_page = datastore.get(keyUrl)) != null) { 
+      //if we return here inlinks will not be updated
+      page=old_page;
+    } 
+    else if (page == null) { //new row 
       if (!additionsAllowed) {
         return;
       }


Reply via email to