Author: ab Date: Thu Jan 4 03:38:45 2007 New Revision: 492525 URL: http://svn.apache.org/viewvc?view=rev&rev=492525 Log: Use different status code when recording a redirected target URL without fetching. Fix also an NPE in Crawl when Generator doesn't produce any new segment. Reported by Meghna Kukreja.
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=492525&r1=492524&r2=492525 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Thu Jan 4 03:38:45 2007 @@ -117,6 +117,10 @@ for (int i = 0; i < depth; i++) { // generate new segment Path segment = generator.generate(crawlDb, segments, -1, topN, System .currentTimeMillis(), false, false); + if (segment == null) { + LOG.info("Stopping at depth=" + i + " - no more URLs to fetch."); + break; + } fetcher.fetch(segment, threads); // fetch it if (!Fetcher.isParsing(job)) { parseSegment.parse(segment); // parse it, if needed Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=492525&r1=492524&r2=492525 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Jan 4 03:38:45 2007 @@ -157,6 +157,8 @@ newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); newUrl = this.urlFilters.filter(newUrl); if (newUrl != null && !newUrl.equals(url.toString())) { + // record that we were redirected + output(url, datum, null, status, CrawlDatum.STATUS_FETCH_REDIR_PERM); url = new Text(newUrl); if (maxRedirect > 0) { redirecting = true; @@ -165,7 +167,7 @@ LOG.debug(" - content redirect to " + url + " (fetching now)"); } } else { - output(url, new CrawlDatum(), null, null, 
CrawlDatum.STATUS_FETCH_REDIR_TEMP); + output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED); if (LOG.isDebugEnabled()) { LOG.debug(" - content redirect to " + url + " (fetching later)"); } @@ -198,7 +200,7 @@ LOG.debug(" - protocol redirect to " + url + " (fetching now)"); } } else { - output(url, new CrawlDatum(), null, null, code); + output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED); if (LOG.isDebugEnabled()) { LOG.debug(" - protocol redirect to " + url + " (fetching later)"); }