Author: lewismc
Date: Wed Sep 16 04:22:47 2015
New Revision: 1703331

URL: http://svn.apache.org/r1703331
Log:
NUTCH-1679 UpdateDb using batchId, link may override crawled page

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1703331&r1=1703330&r2=1703331&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Sep 16 04:22:47 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.4-SNAPSHOT
 
+* NUTCH-1679 UpdateDb using batchId, link may override crawled page (Tien Nguyen Manh, Koen Smets, Alfonso Nishikawa, Alexander Kingson via lewismc)
+
 * NUTCH-2077 Upgrade to Tika 1.10 (Michael Joyce, lewismc)
 
 * NUTCH-2045 index-basic incorrect assignment of next fetch time (page.getFetchTime()) as page fetch time (lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1703331&r1=1703330&r2=1703331&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java Wed Sep 16 04:22:47 2015
@@ -35,10 +35,12 @@ import org.apache.nutch.storage.Mark;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.TableUtil;
 import org.apache.nutch.util.WebPageWritable;
+import org.apache.nutch.storage.StorageUtils;
+import org.apache.gora.store.DataStore;
 import org.slf4j.Logger;
 
 public class DbUpdateReducer extends
-    GoraReducer<UrlWithScore, NutchWritable, String, WebPage> {
+GoraReducer<UrlWithScore, NutchWritable, String, WebPage> {
 
   public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
 
@@ -51,10 +53,11 @@ public class DbUpdateReducer extends
   private ScoringFilters scoringFilters;
   private List<ScoreDatum> inlinkedScoreData = new ArrayList<ScoreDatum>();
   private int maxLinks;
+  public DataStore<String, WebPage> datastore;
 
   @Override
   protected void setup(Context context) throws IOException,
-      InterruptedException {
+  InterruptedException {
     Configuration conf = context.getConfiguration();
     retryMax = conf.getInt("db.fetch.retry.max", 3);
     additionsAllowed = conf.getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
@@ -62,6 +65,17 @@ public class DbUpdateReducer extends
     schedule = FetchScheduleFactory.getFetchSchedule(conf);
     scoringFilters = new ScoringFilters(conf);
     maxLinks = conf.getInt("db.update.max.inlinks", 10000);
+    try {
+      datastore = StorageUtils.createWebStore(conf, String.class, WebPage.class);
+    }
+    catch (ClassNotFoundException e) {
+      throw new IOException(e);
+    }
+  }
+
+  @Override
+  protected void cleanup(Context context) throws IOException, InterruptedException {
+    datastore.close();
   }
 
   @Override
@@ -70,6 +84,8 @@ public class DbUpdateReducer extends
     String keyUrl = key.getUrl().toString();
 
     WebPage page = null;
+    //initialize old_page for checking if the outlink is already in the datastore
+    WebPage old_page = null;
     inlinkedScoreData.clear();
 
     for (NutchWritable nutchWritable : values) {
@@ -94,7 +110,12 @@ public class DbUpdateReducer extends
       return;
     }
 
-    if (page == null) { // new row
+    //check if page is already in the db
+    if(page == null && (old_page = datastore.get(keyUrl)) != null) {
+      //if we return here inlinks will not be updated
+      page=old_page;
+    }
+    else if (page == null) { //new row
       if (!additionsAllowed) {
         return;
       }