Author: ferdy
Date: Thu Apr 26 09:00:17 2012
New Revision: 1330722

URL: http://svn.apache.org/viewvc?rev=1330722&view=rev
Log:
NUTCH-1340 Increase scalability by only removing markers when they actually 
exist for DbUpdateReducer

Modified:
    nutch/branches/nutchgora/CHANGES.txt
    
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
    nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java

Modified: nutch/branches/nutchgora/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1330722&r1=1330721&r2=1330722&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Thu Apr 26 09:00:17 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release nutchgora - Current Development
 
+* NUTCH-1340 Increase scalability by only removing markers when they actually 
exist for DbUpdateReducer (ferdy)
+
 * NUTCH-1333 Introduce AvroStore, DataFileAvroStore and Accumulo Datastore 
implementations (lewismc)
 
 * NUTCH-1312 Nutchgora to send HTTP-accept header (ferdy)

Modified: 
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1330722&r1=1330721&r2=1330722&view=diff
==============================================================================
--- 
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java 
(original)
+++ 
nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java 
Thu Apr 26 09:00:17 2012
@@ -175,11 +175,14 @@ extends GoraReducer<UrlWithScore, NutchW
     }
 
     // clear markers
-
-    page.removeFromMetadata(FetcherJob.REDIRECT_DISCOVERED);
-    Mark.GENERATE_MARK.removeMark(page);
-    Mark.FETCH_MARK.removeMark(page);
-    Utf8 mark = Mark.PARSE_MARK.removeMark(page);
+    // But only delete when they exist. This is much faster for the underlying
+    // store. The markers are on the input anyway.
+    if (page.getFromMetadata(FetcherJob.REDIRECT_DISCOVERED) != null) {
+      page.removeFromMetadata(FetcherJob.REDIRECT_DISCOVERED);
+    }
+    Mark.GENERATE_MARK.removeMarkIfExist(page);
+    Mark.FETCH_MARK.removeMarkIfExist(page);
+    Utf8 mark = Mark.PARSE_MARK.removeMarkIfExist(page);
     if (mark != null) {
       Mark.UPDATEDB_MARK.putMark(page, mark);
     }

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java?rev=1330722&r1=1330721&r2=1330722&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java 
(original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java Thu 
Apr 26 09:00:17 2012
@@ -43,4 +43,16 @@ public enum Mark {
   public Utf8 checkMark(WebPage page) {
     return page.getFromMarkers(name);
   }
+
+  /**
+   * Remove the mark only if the mark is present on the page.
+   * @param page The page to remove the mark from.
+   * @return The value returned by removing the mark, or null if it was absent.
+   */
+  public Utf8 removeMarkIfExist(WebPage page) {
+    if (page.getFromMarkers(name) != null) {
+      return page.removeFromMarkers(name);
+    }
+    return null;
+  }
 }


Reply via email to