Author: ab
Date: Thu Jan 4 03:38:45 2007
New Revision: 492525
URL: http://svn.apache.org/viewvc?view=rev&rev=492525
Log:
Use a different status code when recording a redirected target URL without
fetching. Also fix an NPE in Crawl when Generator doesn't produce any
new segment. Reported by Meghna Kukreja.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=492525&r1=492524&r2=492525
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Thu Jan 4
03:38:45 2007
@@ -117,6 +117,10 @@
for (int i = 0; i < depth; i++) { // generate new segment
Path segment = generator.generate(crawlDb, segments, -1, topN, System
.currentTimeMillis(), false, false);
+ if (segment == null) {
+ LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
+ break;
+ }
fetcher.fetch(segment, threads); // fetch it
if (!Fetcher.isParsing(job)) {
parseSegment.parse(segment); // parse it, if needed
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=492525&r1=492524&r2=492525
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Jan
4 03:38:45 2007
@@ -157,6 +157,8 @@
newUrl = normalizers.normalize(newUrl,
URLNormalizers.SCOPE_FETCHER);
newUrl = this.urlFilters.filter(newUrl);
if (newUrl != null && !newUrl.equals(url.toString())) {
+ // record that we were redirected
+ output(url, datum, null, status,
CrawlDatum.STATUS_FETCH_REDIR_PERM);
url = new Text(newUrl);
if (maxRedirect > 0) {
redirecting = true;
@@ -165,7 +167,7 @@
LOG.debug(" - content redirect to " + url + "
(fetching now)");
}
} else {
- output(url, new CrawlDatum(), null, null,
CrawlDatum.STATUS_FETCH_REDIR_TEMP);
+ output(url, new CrawlDatum(), null, null,
CrawlDatum.STATUS_LINKED);
if (LOG.isDebugEnabled()) {
LOG.debug(" - content redirect to " + url + "
(fetching later)");
}
@@ -198,7 +200,7 @@
LOG.debug(" - protocol redirect to " + url + " (fetching
now)");
}
} else {
- output(url, new CrawlDatum(), null, null, code);
+ output(url, new CrawlDatum(), null, null,
CrawlDatum.STATUS_LINKED);
if (LOG.isDebugEnabled()) {
LOG.debug(" - protocol redirect to " + url + " (fetching
later)");
}
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys - and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs