Author: ferdy Date: Thu Apr 26 09:00:17 2012 New Revision: 1330722 URL: http://svn.apache.org/viewvc?rev=1330722&view=rev Log: NUTCH-1340 Increase scalability by only removing markers when they actually exist for DbUpdateReducer
Modified: nutch/branches/nutchgora/CHANGES.txt nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java Modified: nutch/branches/nutchgora/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1330722&r1=1330721&r2=1330722&view=diff ============================================================================== --- nutch/branches/nutchgora/CHANGES.txt (original) +++ nutch/branches/nutchgora/CHANGES.txt Thu Apr 26 09:00:17 2012 @@ -2,6 +2,8 @@ Nutch Change Log Release nutchgora - Current Development +* NUTCH-1340 Increase scalability by only removing markers when they actually exist for DbUpdaterReducer (ferdy) + * NUTCH-1333 Introduce AvroStore, DataFileAvroStore and Accumulo Datastore implementations (lewismc) * NUTCH-1312 Nutchgora to send HTTP-accept header (ferdy) Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1330722&r1=1330721&r2=1330722&view=diff ============================================================================== --- nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java (original) +++ nutch/branches/nutchgora/src/java/org/apache/nutch/crawl/DbUpdateReducer.java Thu Apr 26 09:00:17 2012 @@ -175,11 +175,14 @@ extends GoraReducer<UrlWithScore, NutchW } // clear markers - - page.removeFromMetadata(FetcherJob.REDIRECT_DISCOVERED); - Mark.GENERATE_MARK.removeMark(page); - Mark.FETCH_MARK.removeMark(page); - Utf8 mark = Mark.PARSE_MARK.removeMark(page); + // But only delete when they exist. This is much faster for the underlying + // store. The markers are on the input anyway. 
+ if (page.getFromMetadata(FetcherJob.REDIRECT_DISCOVERED) != null) { + page.removeFromMetadata(FetcherJob.REDIRECT_DISCOVERED); + } + Mark.GENERATE_MARK.removeMarkIfExist(page); + Mark.FETCH_MARK.removeMarkIfExist(page); + Utf8 mark = Mark.PARSE_MARK.removeMarkIfExist(page); if (mark != null) { Mark.UPDATEDB_MARK.putMark(page, mark); } Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java?rev=1330722&r1=1330721&r2=1330722&view=diff ============================================================================== --- nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java (original) +++ nutch/branches/nutchgora/src/java/org/apache/nutch/storage/Mark.java Thu Apr 26 09:00:17 2012 @@ -43,4 +43,16 @@ public enum Mark { public Utf8 checkMark(WebPage page) { return page.getFromMarkers(name); } + + /** + * Remove the mark only if the mark is present on the page. + * @param page The page to remove the mark from. + * @return If the mark was present. + */ + public Utf8 removeMarkIfExist(WebPage page) { + if (page.getFromMarkers(name) != null) { + return page.removeFromMarkers(name); + } + return null; + } }