Author: ab
Date: Fri Nov 3 03:49:14 2006
New Revision: 470767
URL: http://svn.apache.org/viewvc?view=rev&rev=470767
Log:
Fix NUTCH-387 - URL normalization rules may affect the whole URL, not just
the host, so normalize the full URL and then count by the host of the
normalized result (which normalization may have changed).
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=470767&r1=470766&r2=470767
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Nov 3 03:49:14 2006
@@ -153,7 +153,8 @@
Text url = entry.url;
if (maxPerHost > 0) { // are we counting hosts?
- String host = new URL(url.toString()).getHost();
+ URL u = new URL(url.toString());
+ String host = u.getHost();
if (host == null) {
// unknown host, skip
continue;
@@ -174,11 +175,15 @@
continue;
}
}
+ u = new URL(u.getProtocol(), host, u.getPort(), u.getFile());
+ String urlString = u.toString();
try {
- host = normalizers.normalize(host, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
- host = new URL(host).getHost().toLowerCase();
+ urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
+ host = new URL(urlString).getHost();
} catch (Exception e) {
- LOG.warn("Malformed URL: '" + host + "', skipping");
+ LOG.warn("Malformed URL: '" + urlString + "', skipping (" +
+ StringUtils.stringifyException(e) + ")");
+ continue;
}
IntWritable hostCount = (IntWritable)hostCounts.get(host);
if (hostCount == null) {
Modified:
lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL:
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?view=diff&rev=470767&r1=470766&r2=470767
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Fri Nov 3 03:49:14 2006
@@ -153,8 +153,6 @@
LOG.warn("Couldn't load resource '" + resource + "': " + e);
}
}
- } else {
- LOG.warn("can't load rule file for scope '" + scope + "': " + configFile);
}
if (curRules == EMPTY_RULES || curRules == null) {
LOG.warn("can't find rules for scope '" + scope + "', using default");
-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Nutch-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs