Hello,

I think Doug just applied this patch earlier today.  I had a quick look
at it, since I normalize URLs over at simpy.com, too.

Are you sure the following is the right thing to do?

+      if (file == null || "".equals(file)) {    // add a slash
+        file = "/";
+        changed = true;
+      }

I _think_ not all web servers will handle this added '/' the same way.
I cannot provide any concrete examples, but this kind of thing is very
touchy: web servers do not all implement URL handling identically, just
as many of them deviate from the HTTP 1.0/1.1 standards.

In short, I think it may be safer to leave the file portion of the URL
alone, instead of assuming that the server will interpret '/' the same
way it would interpret an empty path ('').

Otis



--- Luke Baker <[EMAIL PROTECTED]> wrote:

> Hey,
> 
> Attached is a patch which will allow people to easily create their
> own 
> URL normalization class, just like what can be done with the URL 
> filters.  I haven't made any changes in functionality, just made 
> UrlNormalize an interface instead of a class and used it
> appropriately 
> (hopefully).  I also changed the JUnit test, and it passes.  Let me
> know if I need to make any changes, or if there are any objections to
> how things were done.
> 
> Thanks,
> 
> Luke Baker
> > diff -Nur --exclude='*.txt' --exclude='*-site.xml'
--exclude='*.html'
> --exclude='*.jar' --exclude='*.class' nutch/conf/nutch-default.xml
> nutch-changed/conf/nutch-default.xml
> --- nutch/conf/nutch-default.xml      2004-08-09 19:23:52.000000000 -0400
> +++ nutch-changed/conf/nutch-default.xml      2004-08-27
> 16:07:31.000000000 -0400
> @@ -391,6 +391,14 @@
>    <description>Name of file on CLASSPATH containing default regular
>    expressions used by RegexURLFilter.</description>
>  </property>
> +                                                                    
>                                    
> +<!-- URL normalizer properties -->
> +
> +<property>
> +  <name>urlnormalizer.class</name>
> +  <value>net.nutch.net.BasicUrlNormalizer</value>
> +  <description>Name of the class used to normalize
> URLs.</description>
> +</property>
>  
>  <!-- mime properties -->
>  
> diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html'
> --exclude='*.jar' --exclude='*.class'
> nutch/src/java/net/nutch/db/Link.java
> nutch-changed/src/java/net/nutch/db/Link.java
> --- nutch/src/java/net/nutch/db/Link.java     2003-05-21
> 12:25:10.000000000 -0400
> +++ nutch-changed/src/java/net/nutch/db/Link.java     2004-08-27
> 16:03:22.000000000 -0400
> @@ -9,7 +9,7 @@
>  
>  import net.nutch.io.*;
>  import net.nutch.util.*;
> -import net.nutch.net.UrlNormalizer;
> +import net.nutch.net.UrlNormalizerFactory;
>  
>  /*********************************************
>   * This is the field in the Link Database.
> @@ -56,7 +56,7 @@
>      public Link(MD5Hash fromID, long domainID, String urlString,
> String anchorText)
>        throws MalformedURLException {
>          this.fromID = fromID;
> -        this.url = new UTF8(UrlNormalizer.normalize(urlString));
> +        this.url = new
> UTF8(UrlNormalizerFactory.getNormalizer().normalize(urlString));
>          this.domainID = domainID;
>          
>          // truncate long anchors
> diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html'
> --exclude='*.jar' --exclude='*.class'
> nutch/src/java/net/nutch/db/Page.java
> nutch-changed/src/java/net/nutch/db/Page.java
> --- nutch/src/java/net/nutch/db/Page.java     2003-05-21
> 12:25:10.000000000 -0400
> +++ nutch-changed/src/java/net/nutch/db/Page.java     2004-08-27
> 16:03:12.000000000 -0400
> @@ -9,7 +9,7 @@
>  
>  import net.nutch.io.*;
>  import net.nutch.util.*;
> -import net.nutch.net.UrlNormalizer;
> +import net.nutch.net.UrlNormalizerFactory;
>  
>  /*********************************************
>   * A row in the Page Database.
> @@ -181,7 +181,7 @@
>    //
>    public UTF8 getURL() { return url; }
>    public void setURL(String url) throws MalformedURLException {
> -    this.url = new UTF8(UrlNormalizer.normalize(url));
> +    this.url = new
> UTF8(UrlNormalizerFactory.getNormalizer().normalize(url));
>    }
>  
>    public MD5Hash getMD5() { return md5; }
> diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html'
> --exclude='*.jar' --exclude='*.class'
> nutch/src/java/net/nutch/net/BasicUrlNormalizer.java
> nutch-changed/src/java/net/nutch/net/BasicUrlNormalizer.java
> --- nutch/src/java/net/nutch/net/BasicUrlNormalizer.java      1969-12-31
> 19:00:00.000000000 -0500
> +++ nutch-changed/src/java/net/nutch/net/BasicUrlNormalizer.java
> 2004-08-27 16:01:49.000000000 -0400
> @@ -0,0 +1,71 @@
> +/* Copyright (c) 2003 The Nutch Organization.  All rights reserved. 
>  */
> +/* Use subject to the conditions in
> http://www.nutch.org/LICENSE.txt. */
> +
> +package net.nutch.net;
> +
> +import java.net.URL;
> +import java.net.MalformedURLException;
> +// import java.net.URI;
> +// import java.net.URISyntaxException;
> +
> +import java.util.logging.Logger;
> +import net.nutch.util.LogFormatter;
> +
> +/** Converts URLs to a normal form . */
> +public class BasicUrlNormalizer implements UrlNormalizer {
> +  public static final Logger LOG =
> +    LogFormatter.getLogger("net.nutch.net.BasicUrlNormalizer");
> +
> +  public String normalize(String urlString)
> +    throws MalformedURLException {
> +
> +    if ("".equals(urlString))                     // permit empty
> +      return urlString;
> +
> +    urlString = urlString.trim();                 // remove extra
> spaces
> +
> +    URL url = new URL(urlString);
> +
> +    String protocol = url.getProtocol();
> +    String host = url.getHost();
> +    int port = url.getPort();
> +    String file = url.getFile();
> +
> +    boolean changed = false;
> +
> +    if (!urlString.startsWith(protocol))        // protocol was
> lowercased
> +      changed = true;
> +
> +    if ("http".equals(protocol) || "ftp".equals(protocol)) {
> +      
> +      if (host != null) {
> +        String newHost = host.toLowerCase();    // lowercase host
> +        if (!host.equals(newHost)) {
> +          host = newHost;
> +          changed = true;
> +        }
> +      }
> +
> +      if (port == url.getDefaultPort()) {       // uses default port
> +        port = -1;                              // so don't specify
> it
> +        changed = true;
> +      }
> +
> +      if (file == null || "".equals(file)) {    // add a slash
> +        file = "/";
> +        changed = true;
> +      }
> +
> +      if (url.getRef() != null) {                 // remove the ref
> +        changed = true;
> +      }
> +
> +    }
> +
> +    if (changed)
> +      urlString = new URL(protocol, host, port, file).toString();
> +
> +    return urlString;
> +  }
> +
> +}
> diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html'
> --exclude='*.jar' --exclude='*.class'
> nutch/src/java/net/nutch/net/UrlNormalizerFactory.java
> nutch-changed/src/java/net/nutch/net/UrlNormalizerFactory.java
> --- nutch/src/java/net/nutch/net/UrlNormalizerFactory.java    1969-12-31
> 19:00:00.000000000 -0500
> +++ nutch-changed/src/java/net/nutch/net/UrlNormalizerFactory.java
> 2004-08-27 16:02:04.000000000 -0400
> @@ -0,0 +1,38 @@
> +/* Copyright (c) 2003 The Nutch Organization.  All rights reserved. 
>  */
> +/* Use subject to the conditions in
> http://www.nutch.org/LICENSE.txt. */
> +
> +package net.nutch.net;
> +
> +import net.nutch.util.*;
> +import java.util.logging.*;
> +
> +/** Factory to create a UrlNormalizer from "urlnormalizer.class"
> config property. */
> +public class UrlNormalizerFactory {
> +  private static final Logger LOG =
> +    LogFormatter.getLogger("net.nutch.net.UrlNormalizerFactory");
> +
> +  private static final String URLNORMALIZER_CLASS =
> +    NutchConf.get("urlnormalizer.class");
> +
> +  private UrlNormalizerFactory() {}                   // no public
> ctor
> +
> +  private static UrlNormalizer normalizer;
> +
> +  /** Return the default UrlNormalizer implementation. */
> +  public static UrlNormalizer getNormalizer() {
> +
> +    if (normalizer == null) {
> +      try {
> +        LOG.info("Using URL normalizer: " + URLNORMALIZER_CLASS);
> +        Class normalizerClass = Class.forName(URLNORMALIZER_CLASS);
> +        normalizer = (UrlNormalizer)normalizerClass.newInstance();
> +      } catch (Exception e) {
> +        throw new RuntimeException("Couldn't create
> "+URLNORMALIZER_CLASS, e);
> +      }
> +    }
> +
> +    return normalizer;
> +
> +  }
> +
> +}
> diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html'
> --exclude='*.jar' --exclude='*.class'
> nutch/src/java/net/nutch/net/UrlNormalizer.java
> nutch-changed/src/java/net/nutch/net/UrlNormalizer.java
> --- nutch/src/java/net/nutch/net/UrlNormalizer.java   2004-04-23
> 15:32:33.000000000 -0400
> +++ nutch-changed/src/java/net/nutch/net/UrlNormalizer.java
> 2004-08-27 16:02:23.000000000 -0400
> @@ -3,69 +3,12 @@
>  
>  package net.nutch.net;
>  
> -import java.net.URL;
>  import java.net.MalformedURLException;
> -// import java.net.URI;
> -// import java.net.URISyntaxException;
>  
> -import java.util.logging.Logger;
> -import net.nutch.util.LogFormatter;
> -
> -/** Converts URLs to a normal form . */
> -public class UrlNormalizer {
> -  public static final Logger LOG =
> -    LogFormatter.getLogger("net.nutch.net.UrlNormalizer");
> -
> -  public static String normalize(String urlString)
> -    throws MalformedURLException {
> -
> -    if ("".equals(urlString))                     // permit empty
> -      return urlString;
> -
> -    urlString = urlString.trim();                 // remove extra
> spaces
> -
> -    URL url = new URL(urlString);
> -
> -    String protocol = url.getProtocol();
> -    String host = url.getHost();
> -    int port = url.getPort();
> -    String file = url.getFile();
> -
> -    boolean changed = false;
> -
> -    if (!urlString.startsWith(protocol))        // protocol was
> lowercased
> -      changed = true;
> -
> -    if ("http".equals(protocol) || "ftp".equals(protocol)) {
> -      
> -      if (host != null) {
> -        String newHost = host.toLowerCase();    // lowercase host
> -        if (!host.equals(newHost)) {
> -          host = newHost;
> -          changed = true;
> -        }
> -      }
> -
> -      if (port == url.getDefaultPort()) {       // uses default port
> -        port = -1;                              // so don't specify
> it
> -        changed = true;
> -      }
> -
> -      if (file == null || "".equals(file)) {    // add a slash
> -        file = "/";
> -        changed = true;
> -      }
> -
> -      if (url.getRef() != null) {                 // remove the ref
> -        changed = true;
> -      }
> -
> -    }
> -
> -    if (changed)
> -      urlString = new URL(protocol, host, port, file).toString();
> -
> -    return urlString;
> -  }
> +/** Interface used to convert URLs to normal form and optionally do
> regex substitutions */
> +public interface UrlNormalizer {
> +  
> +  /* Interface for URL normalization */
> +  public String normalize(String urlString) throws
> MalformedURLException;
>  
>  }
> diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html'
> --exclude='*.jar' --exclude='*.class'
> nutch/src/java/net/nutch/parse/Outlink.java
> nutch-changed/src/java/net/nutch/parse/Outlink.java
> --- nutch/src/java/net/nutch/parse/Outlink.java       2004-07-10
> 16:21:37.000000000 -0400
> +++ nutch-changed/src/java/net/nutch/parse/Outlink.java       2004-08-27
> 16:03:55.000000000 -0400
> @@ -7,7 +7,7 @@
>  import java.net.MalformedURLException;
>  
>  import net.nutch.io.*;
> -import net.nutch.net.UrlNormalizer;
> +import net.nutch.net.UrlNormalizerFactory;
>  
>  /* An outgoing link from a page. */
>  public class Outlink implements Writable {
> @@ -18,7 +18,7 @@
>    public Outlink() {}
>  
>    public Outlink(String toUrl, String anchor) throws
> MalformedURLException {
> -    this.toUrl = UrlNormalizer.normalize(toUrl);
> +    this.toUrl =
> UrlNormalizerFactory.getNormalizer().normalize(toUrl);
>      this.anchor = anchor;
>    }
>  
> diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html'
> --exclude='*.jar' --exclude='*.class'
> nutch/src/test/net/nutch/net/TestUrlNormalizer.java
> nutch-changed/src/test/net/nutch/net/TestUrlNormalizer.java
> --- nutch/src/test/net/nutch/net/TestUrlNormalizer.java       2004-04-23
> 15:32:34.000000000 -0400
> +++ nutch-changed/src/test/net/nutch/net/TestUrlNormalizer.java
> 2004-08-27 16:01:16.000000000 -0400
> @@ -36,7 +36,7 @@
>    }
>  
>    private void normalizeTest(String weird, String normal) throws
> Exception {
> -    assertEquals(normal, UrlNormalizer.normalize(weird));
> +    assertEquals(normal,
> UrlNormalizerFactory.getNormalizer().normalize(weird));
>    }
>       
>    public static void main(String[] args) throws Exception {
> 



-------------------------------------------------------
This SF.Net email is sponsored by BEA Weblogic Workshop
FREE Java Enterprise J2EE developer tools!
Get your free copy of BEA WebLogic Workshop 8.1 today.
http://ads.osdn.com/?ad_id=5047&alloc_id=10808&op=click
_______________________________________________
Nutch-developers mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-developers

Reply via email to