Author: ab
Date: Fri Nov  3 03:49:14 2006
New Revision: 470767

URL: http://svn.apache.org/viewvc?view=rev&rev=470767
Log:
Fix NUTCH-387 - URL normalization rules may affect the whole URL, although
we count only hosts (possibly changed after normalization).

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
    lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=470767&r1=470766&r2=470767
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Nov  3 03:49:14 2006
@@ -153,7 +153,8 @@
         Text url = entry.url;
 
         if (maxPerHost > 0) {                     // are we counting hosts?
-          String host = new URL(url.toString()).getHost();
+          URL u = new URL(url.toString());
+          String host = u.getHost();
           if (host == null) {
             // unknown host, skip
             continue;
@@ -174,11 +175,15 @@
               continue;
             }
           }
+          u = new URL(u.getProtocol(), host, u.getPort(), u.getFile());
+          String urlString = u.toString();
           try {
-            host = normalizers.normalize(host, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
-            host = new URL(host).getHost().toLowerCase();
+            urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
+            host = new URL(urlString).getHost();
           } catch (Exception e) {
-            LOG.warn("Malformed URL: '" + host + "', skipping");
+            LOG.warn("Malformed URL: '" + urlString + "', skipping (" +
+                StringUtils.stringifyException(e) + ")");
+            continue;
           }
           IntWritable hostCount = (IntWritable)hostCounts.get(host);
           if (hostCount == null) {

Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?view=diff&rev=470767&r1=470766&r2=470767
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Fri Nov  3 03:49:14 2006
@@ -153,8 +153,6 @@
             LOG.warn("Couldn't load resource '" + resource + "': " + e);
           }
         }
-      } else {
-        LOG.warn("can't load rule file for scope '" + scope + "': " + configFile);
       }
       if (curRules == EMPTY_RULES || curRules == null) {
         LOG.warn("can't find rules for scope '" + scope + "', using default");



-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Nutch-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to