Perhaps you should start by reporting which version you are using
... The version in trunk/ certainly supports this argument. The
version in 0.8.1 does not support it, but it's easy to add.
I've "backported" revision 450799 to the 0.8.x branch to support
"-noAdditions". Perhaps you could consider committing it there... (I
haven't tested it yet, though).
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revisión: 492707)
+++ conf/nutch-default.xml (copia de trabajo)
@@ -237,6 +237,15 @@
</property>
<property>
+ <name>db.update.additions.allowed</name>
+ <value>true</value>
+ <description>If true, updatedb will add newly discovered URLs, if false
+ only already existing URLs in the CrawlDb will be updated and no new
+ URLs will be added.
+ </description>
+</property>
+
+<property>
<name>db.ignore.internal.links</name>
<value>true</value>
<description>If true, when adding new links to a page, links from
Index: src/java/org/apache/nutch/crawl/CrawlDb.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDb.java (revisión: 492707)
+++ src/java/org/apache/nutch/crawl/CrawlDb.java (copia de trabajo)
@@ -36,6 +36,7 @@
* crawldb accordingly.
*/
public class CrawlDb extends Configured {
+ public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
public static final Log LOG = LogFactory.getLog(CrawlDb.class);
@@ -43,16 +44,22 @@
public CrawlDb(Configuration conf) {
super(conf);
}
+
+ public void update(Path crawlDb, Path segment) throws IOException {
+ boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
+ update(crawlDb, segment, additionsAllowed);
+ }
- public void update(Path crawlDb, Path segment) throws IOException {
-
+ public void update(Path crawlDb, Path segment, boolean additionsAllowed) throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb update: starting");
LOG.info("CrawlDb update: db: " + crawlDb);
LOG.info("CrawlDb update: segment: " + segment);
+ LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
}
JobConf job = CrawlDb.createJob(getConf(), crawlDb);
+ job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
job.addInputPath(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
@@ -108,13 +115,24 @@
}
public static void main(String[] args) throws Exception {
- CrawlDb crawlDb = new CrawlDb(NutchConfiguration.create());
+ Configuration c = NutchConfiguration.create();
+ CrawlDb crawlDb = new CrawlDb(c);
if (args.length < 2) {
- System.err.println("Usage: <crawldb> <segment>");
+ System.err.println("Usage: <crawldb> <segment> [-noAdditions]");
+ System.err.println("\tcrawldb\tCrawlDb to update");
+ System.err.println("\tsegment\tsegment name to update from");
+ System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
return;
}
+ boolean additionsAllowed = c.getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
+ for(int i = 2 ; i < args.length; i++) {
+ if (args[i].equals("-noAdditions")) {
+ additionsAllowed = false;
+ }
+ }
+
- crawlDb.update(new Path(args[0]), new Path(args[1]));
+ crawlDb.update(new Path(args[0]), new Path(args[1]), additionsAllowed);
}
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java (revisión: 492707)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java (copia de trabajo)
@@ -36,10 +36,12 @@
private CrawlDatum result = new CrawlDatum();
private ArrayList linked = new ArrayList();
private ScoringFilters scfilters = null;
+ private boolean additionsAllowed;
public void configure(JobConf job) {
retryMax = job.getInt("db.fetch.retry.max", 3);
scfilters = new ScoringFilters(job);
+ additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
}
public void close() {}
@@ -74,6 +76,9 @@
}
}
+ // if it doesn't already exist, skip it
+ if (old == null && !additionsAllowed) return;
+
// initialize with the latest version
result.set(highest);
if (old != null) {
-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier.
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Nutch-general mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-general