[Nutch-dev] RegexUrlNormalizer

Luke Baker Sat, 04 Sep 2004 12:40:08 -0700

Hi all,

I've finished up a patch to add a RegexUrlNormalizer class. I've left the default to just use the BasicUrlNormalizer class. The RegexUrlNormalizer class is useful for things like stripping out session IDs from URLs. To use the RegexUrlNormalizer class take a look at the values for urlnormalizer.class and urlnormalizer.regex.file in nutch-default.xml. The RegexUrlNormalizer class extends the BasicUrlNormalizer, and it does the basic normalization as well.

It'd be great if someone could take a look at this and commit it to CVS. Let me know if there are any questions or suggestions. Thanks a lot.

Luke

diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/build.xml nutch-changed/build.xml
--- nutch/build.xml	2004-08-31 13:52:03.000000000 -0400
+++ nutch-changed/build.xml	2004-09-04 15:13:44.000000000 -0400
@@ -156,6 +156,7 @@
     <junit printsummary="yes" haltonfailure="no" fork="yes" dir="${basedir}"
       errorProperty="tests.failed" failureProperty="tests.failed">
       <sysproperty key="test.build.data" value="${test.build.data}"/>
+      <sysproperty key="test.src.dir" value="${test.src.dir}"/>
       <classpath refid="test.classpath"/>
       <formatter type="plain" />
       <batchtest todir="${test.build.dir}" unless="testcase">
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/conf/nutch-default.xml nutch-changed/conf/nutch-default.xml
--- nutch/conf/nutch-default.xml	2004-09-02 18:41:15.000000000 -0400
+++ nutch-changed/conf/nutch-default.xml	2004-09-04 14:44:50.000000000 -0400
@@ -410,6 +410,19 @@
   expressions used by RegexURLFilter.</description>
 </property>
 
+<!-- URL normalizer properties -->
+                                                                                
+<property>
+  <name>urlnormalizer.class</name>
+  <value>net.nutch.net.BasicUrlNormalizer</value>
+  <description>Name of the class used to normalize URLs.</description>
+</property>
+
+<property>
+  <name>urlnormalizer.regex.file</name>
+  <value>regex-normalize.xml</value>
+  <description>Name of the config file used by the RegexUrlNormalizer class.</description></property>
+
 <!-- mime properties -->
 
 <property>
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/conf/regex-normalize.xml.template nutch-changed/conf/regex-normalize.xml.template
--- nutch/conf/regex-normalize.xml.template	1969-12-31 19:00:00.000000000 -0500
+++ nutch-changed/conf/regex-normalize.xml.template	2004-09-04 15:17:42.000000000 -0400
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!-- This is the configuration file for the RegexUrlNormalizer class.
+     This is intended so that users can specify substitutions to be
+     done on URLs. The regex engine that is used is Perl5 compatible.
+     The rules are applied to URLs in the order they occur in this file.  -->
+
+<!-- WATCH OUT: an XML parser reads this file, so ampersands must be
+     escaped as &amp; -->
+
+<!-- The following rules show how to strip out session IDs 
+     that are 32 characters long and have the parameter 
+     name of PHPSESSID. Order does matter!  -->
+<regex-normalize>
+<regex>
+  <pattern>(\?|\&amp;|\&amp;amp;)PHPSESSID=[a-zA-Z0-9]{32}$</pattern>
+  <substitution></substitution>
+</regex>
+<regex>
+  <pattern>(\?|\&amp;|\&amp;amp;)PHPSESSID=[a-zA-Z0-9]{32}(\&amp;|\&amp;amp;)(.*)</pattern>
+  <substitution>$1$3</substitution>
+</regex>
+</regex-normalize>
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/db/Link.java nutch-changed/src/java/net/nutch/db/Link.java
--- nutch/src/java/net/nutch/db/Link.java	2003-05-21 12:25:10.000000000 -0400
+++ nutch-changed/src/java/net/nutch/db/Link.java	2004-09-04 14:45:46.000000000 -0400
@@ -9,7 +9,7 @@
 
 import net.nutch.io.*;
 import net.nutch.util.*;
-import net.nutch.net.UrlNormalizer;
+import net.nutch.net.UrlNormalizerFactory;
 
 /*********************************************
  * This is the field in the Link Database.
@@ -56,7 +56,7 @@
     public Link(MD5Hash fromID, long domainID, String urlString, String anchorText)
       throws MalformedURLException {
         this.fromID = fromID;
-        this.url = new UTF8(UrlNormalizer.normalize(urlString));
+        this.url = new UTF8(UrlNormalizerFactory.getNormalizer().normalize(urlString));
         this.domainID = domainID;
         
         // truncate long anchors
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/db/Page.java nutch-changed/src/java/net/nutch/db/Page.java
--- nutch/src/java/net/nutch/db/Page.java	2003-05-21 12:25:10.000000000 -0400
+++ nutch-changed/src/java/net/nutch/db/Page.java	2004-09-04 14:46:05.000000000 -0400
@@ -9,7 +9,7 @@
 
 import net.nutch.io.*;
 import net.nutch.util.*;
-import net.nutch.net.UrlNormalizer;
+import net.nutch.net.UrlNormalizerFactory;
 
 /*********************************************
  * A row in the Page Database.
@@ -181,7 +181,7 @@
   //
   public UTF8 getURL() { return url; }
   public void setURL(String url) throws MalformedURLException {
-    this.url = new UTF8(UrlNormalizer.normalize(url));
+    this.url = new UTF8(UrlNormalizerFactory.getNormalizer().normalize(url));
   }
 
   public MD5Hash getMD5() { return md5; }
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/net/BasicUrlNormalizer.java nutch-changed/src/java/net/nutch/net/BasicUrlNormalizer.java
--- nutch/src/java/net/nutch/net/BasicUrlNormalizer.java	1969-12-31 19:00:00.000000000 -0500
+++ nutch-changed/src/java/net/nutch/net/BasicUrlNormalizer.java	2004-09-04 14:48:58.000000000 -0400
@@ -0,0 +1,70 @@
+/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.net;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+// import java.net.URI;
+// import java.net.URISyntaxException;
+
+import java.util.logging.Logger;
+import net.nutch.util.LogFormatter;
+
+/** Converts URLs to a normal form. */
+public class BasicUrlNormalizer implements UrlNormalizer {
+  public static final Logger LOG =
+    LogFormatter.getLogger("net.nutch.net.BasicUrlNormalizer");
+  
+  public String normalize(String urlString)
+    throws MalformedURLException {
+    if ("".equals(urlString))                     // permit empty
+      return urlString;
+
+    urlString = urlString.trim();                 // remove extra spaces
+
+    URL url = new URL(urlString);
+
+    String protocol = url.getProtocol();
+    String host = url.getHost();
+    int port = url.getPort();
+    String file = url.getFile();
+
+    boolean changed = false;
+
+    if (!urlString.startsWith(protocol))        // protocol was lowercased
+      changed = true;
+
+    if ("http".equals(protocol) || "ftp".equals(protocol)) {
+      
+      if (host != null) {
+        String newHost = host.toLowerCase();    // lowercase host
+        if (!host.equals(newHost)) {
+          host = newHost;
+          changed = true;
+        }
+      }
+
+      if (port == url.getDefaultPort()) {       // uses default port
+        port = -1;                              // so don't specify it
+        changed = true;
+      }
+
+      if (file == null || "".equals(file)) {    // add a slash
+        file = "/";
+        changed = true;
+      }
+
+      if (url.getRef() != null) {                 // remove the ref
+        changed = true;
+      }
+
+    }
+
+    if (changed)
+      urlString = new URL(protocol, host, port, file).toString();
+
+    return urlString;
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/net/RegexUrlNormalizer.java nutch-changed/src/java/net/nutch/net/RegexUrlNormalizer.java
--- nutch/src/java/net/nutch/net/RegexUrlNormalizer.java	1969-12-31 19:00:00.000000000 -0500
+++ nutch-changed/src/java/net/nutch/net/RegexUrlNormalizer.java	2004-09-04 14:48:58.000000000 -0400
@@ -0,0 +1,155 @@
+/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.net;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.io.IOException;
+// import java.net.URI;
+// import java.net.URISyntaxException;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.logging.Logger;
+import net.nutch.util.LogFormatter;
+
+import javax.xml.parsers.*;
+import org.w3c.dom.*;
+import org.apache.oro.text.regex.*;
+
+import net.nutch.util.*;
+
+/** Allows users to do regex substitutions on all/any URLs that are encountered, which
+ * is useful for stripping session IDs from URLs.
+ *
+ * <p>This class must be specified as the URL normalizer to be used in <tt>nutch-site.xml</tt>
+ * or <tt>nutch-default.xml</tt>.  To do this specify the <tt>urlnormalizer.class</tt> property to
+ * have the value:  <tt>net.nutch.net.RegexUrlNormalizer</tt>.  The <tt>urlnormalizer.regex.file</tt>
+ * property should also be set to the file name of an xml file which should contain the patterns
+ * and substitutions to be done on encountered URLs.</p>
+ *
+ * @author Luke Baker
+ */
+public class RegexUrlNormalizer extends BasicUrlNormalizer
+  implements UrlNormalizer {
+
+    /** Class which holds a compiled pattern and its corresponding substitution string. */
+    private static class Rule {
+      public Perl5Pattern pattern;
+      public String substitution;	
+    }
+    
+    private List rules;
+    private PatternMatcher matcher = new Perl5Matcher();
+    
+    /** Default constructor which gets the file name from either <tt>nutch-site.xml</tt>
+      * or <tt>nutch-default.xml</tt> and reads that configuration file.  It stores the regex patterns
+      * and corresponding substitutions in a List. The file should be in the CLASSPATH. */
+    public RegexUrlNormalizer() throws IOException, MalformedPatternException {
+      String filename = NutchConf.get("urlnormalizer.regex.file");
+      URL url= NutchConf.class.getClassLoader().getResource(filename);
+     
+      rules=readConfigurationFile(url.toString());
+    }
+    
+    /** Constructor which can be passed the file name, so it doesn't look in the configuration files for it. */
+    public RegexUrlNormalizer(String filename)
+      throws IOException, MalformedPatternException {
+      //URL url= NutchConf.class.getClassLoader().getResource(filename);
+      rules = readConfigurationFile(filename);
+    }
+    
+    
+    /** This function does the replacements by iterating through all the regex patterns.
+      * It accepts a string url as input and returns the altered string. */
+    public synchronized String regexNormalize(String urlString) {
+      Iterator i=rules.iterator();
+      while(i.hasNext()) {
+        Rule r=(Rule) i.next();
+        urlString = Util.substitute(matcher, r.pattern, 
+          new Perl5Substitution(r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual substitution
+      }
+      return urlString;
+    }
+   
+    /** Normalizes any URLs by calling super.normalize()
+      * and regexNormalize(). This is the function that gets called
+      * elsewhere in Nutch. */
+    public synchronized String normalize(String urlString)
+      throws MalformedURLException {
+        urlString = super.normalize(urlString); // run basicNormalize first to ready for regexNormalize
+        urlString = regexNormalize(urlString);
+        urlString = super.normalize(urlString); // make sure regexNormalize didn't screw up the URL
+        return urlString;
+  }
+  
+  
+  
+  /** Reads the configuration file and populates a List of Rules. */
+  private static List readConfigurationFile(String filename)
+    throws IOException, MalformedPatternException {
+
+    Perl5Compiler compiler=new Perl5Compiler();
+    List rules=new ArrayList();
+    try {
+      
+      LOG.info("loading " + filename);
+      // borrowed heavily from code in NutchConf.java
+      Document doc =
+        DocumentBuilderFactory.newInstance().newDocumentBuilder()
+        .parse(filename);
+      Element root = doc.getDocumentElement();
+      if (!"regex-normalize".equals(root.getTagName()))
+        LOG.severe("bad conf file: top-level element not <regex-normalize>");
+      NodeList regexes = root.getChildNodes();
+      for (int i = 0; i < regexes.getLength(); i++) {
+        Node regexNode = regexes.item(i);
+        if (!(regexNode instanceof Element))
+          continue;
+        Element regex = (Element)regexNode;
+        if (!"regex".equals(regex.getTagName()))
+          LOG.warning("bad conf file: element not <regex>");
+        NodeList fields = regex.getChildNodes();
+        String patternValue = null;
+        String subValue = null;
+        for (int j = 0; j < fields.getLength(); j++) {
+          Node fieldNode = fields.item(j);
+          if (!(fieldNode instanceof Element))
+            continue;
+          Element field = (Element)fieldNode;
+          if ("pattern".equals(field.getTagName()) && field.hasChildNodes())
+            patternValue = ((Text)field.getFirstChild()).getData();
+          if ("substitution".equals(field.getTagName()) && field.hasChildNodes())
+            subValue = ((Text)field.getFirstChild()).getData();
+          if (!field.hasChildNodes())
+            subValue = "";
+        }
+        if (patternValue != null && subValue != null) {
+          Rule rule=new Rule();
+          rule.pattern=(Perl5Pattern) compiler.compile(patternValue);
+          rule.substitution=subValue;
+          rules.add(rule);
+        }
+      }
+        
+    } catch (Exception e) {
+      LOG.severe("error parsing " + filename +" conf file: " + e);
+    }
+    return rules;
+  }
+  
+  /** Spits out patterns and substitutions that are in the configuration file. */
+  public static void main(String args[])
+    throws MalformedPatternException, IOException {
+      RegexUrlNormalizer normalizer = new RegexUrlNormalizer();
+      Iterator i=normalizer.rules.iterator();
+      while(i.hasNext()) {
+        Rule r=(Rule) i.next();
+        System.out.print(r.pattern.getPattern() + "  ");
+        System.out.println(r.substitution);
+      }
+    }
+  
+}
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/net/UrlNormalizerFactory.java nutch-changed/src/java/net/nutch/net/UrlNormalizerFactory.java
--- nutch/src/java/net/nutch/net/UrlNormalizerFactory.java	1969-12-31 19:00:00.000000000 -0500
+++ nutch-changed/src/java/net/nutch/net/UrlNormalizerFactory.java	2004-09-04 14:48:58.000000000 -0400
@@ -0,0 +1,38 @@
+/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.net;
+
+import net.nutch.util.*;
+import java.util.logging.*;
+
+/** Factory to create a UrlNormalizer from "urlnormalizer.class" config property. */
+public class UrlNormalizerFactory {
+  private static final Logger LOG =
+    LogFormatter.getLogger("net.nutch.net.UrlNormalizerFactory");
+
+  private static final String URLNORMALIZER_CLASS =
+    NutchConf.get("urlnormalizer.class");
+
+  private UrlNormalizerFactory() {}                   // no public ctor
+
+  private static UrlNormalizer normalizer;
+
+  /** Return the default UrlNormalizer implementation. */
+  public static UrlNormalizer getNormalizer() {
+
+    if (normalizer == null) {
+      try {
+        LOG.info("Using URL normalizer: " + URLNORMALIZER_CLASS);
+        Class normalizerClass = Class.forName(URLNORMALIZER_CLASS);
+        normalizer = (UrlNormalizer)normalizerClass.newInstance();
+      } catch (Exception e) {
+        throw new RuntimeException("Couldn't create "+URLNORMALIZER_CLASS, e);
+      }
+    }
+
+    return normalizer;
+
+  }
+
+}
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/net/UrlNormalizer.java nutch-changed/src/java/net/nutch/net/UrlNormalizer.java
--- nutch/src/java/net/nutch/net/UrlNormalizer.java	2004-04-23 15:32:33.000000000 -0400
+++ nutch-changed/src/java/net/nutch/net/UrlNormalizer.java	2004-09-04 14:48:58.000000000 -0400
@@ -3,69 +3,12 @@
 
 package net.nutch.net;
 
-import java.net.URL;
 import java.net.MalformedURLException;
-// import java.net.URI;
-// import java.net.URISyntaxException;
 
-import java.util.logging.Logger;
-import net.nutch.util.LogFormatter;
-
-/** Converts URLs to a normal form . */
-public class UrlNormalizer {
-  public static final Logger LOG =
-    LogFormatter.getLogger("net.nutch.net.UrlNormalizer");
-
-  public static String normalize(String urlString)
-    throws MalformedURLException {
-
-    if ("".equals(urlString))                     // permit empty
-      return urlString;
-
-    urlString = urlString.trim();                 // remove extra spaces
-
-    URL url = new URL(urlString);
-
-    String protocol = url.getProtocol();
-    String host = url.getHost();
-    int port = url.getPort();
-    String file = url.getFile();
-
-    boolean changed = false;
-
-    if (!urlString.startsWith(protocol))        // protocol was lowercased
-      changed = true;
-
-    if ("http".equals(protocol) || "ftp".equals(protocol)) {
-      
-      if (host != null) {
-        String newHost = host.toLowerCase();    // lowercase host
-        if (!host.equals(newHost)) {
-          host = newHost;
-          changed = true;
-        }
-      }
-
-      if (port == url.getDefaultPort()) {       // uses default port
-        port = -1;                              // so don't specify it
-        changed = true;
-      }
-
-      if (file == null || "".equals(file)) {    // add a slash
-        file = "/";
-        changed = true;
-      }
-
-      if (url.getRef() != null) {                 // remove the ref
-        changed = true;
-      }
-
-    }
-
-    if (changed)
-      urlString = new URL(protocol, host, port, file).toString();
-
-    return urlString;
-  }
+/** Interface used to convert URLs to normal form and optionally do regex substitutions */
+public interface UrlNormalizer {
+  
+  /* Interface for URL normalization */
+  public String normalize(String urlString) throws MalformedURLException;
 
 }
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/java/net/nutch/parse/Outlink.java nutch-changed/src/java/net/nutch/parse/Outlink.java
--- nutch/src/java/net/nutch/parse/Outlink.java	2004-07-10 16:21:37.000000000 -0400
+++ nutch-changed/src/java/net/nutch/parse/Outlink.java	2004-09-04 14:48:19.000000000 -0400
@@ -7,7 +7,7 @@
 import java.net.MalformedURLException;
 
 import net.nutch.io.*;
-import net.nutch.net.UrlNormalizer;
+import net.nutch.net.UrlNormalizerFactory;
 
 /* An outgoing link from a page. */
 public class Outlink implements Writable {
@@ -18,7 +18,7 @@
   public Outlink() {}
 
   public Outlink(String toUrl, String anchor) throws MalformedURLException {
-    this.toUrl = UrlNormalizer.normalize(toUrl);
+    this.toUrl = UrlNormalizerFactory.getNormalizer().normalize(toUrl);
     this.anchor = anchor;
   }
 
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/test/net/nutch/net/TestBasicUrlNormalizer.java nutch-changed/src/test/net/nutch/net/TestBasicUrlNormalizer.java
--- nutch/src/test/net/nutch/net/TestBasicUrlNormalizer.java	1969-12-31 19:00:00.000000000 -0500
+++ nutch-changed/src/test/net/nutch/net/TestBasicUrlNormalizer.java	2004-09-04 14:49:33.000000000 -0400
@@ -0,0 +1,48 @@
+/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.net;
+
+import java.net.URL;
+import junit.framework.TestCase;
+
+/** Unit tests for BasicUrlNormalizer. */
+public class TestBasicUrlNormalizer extends TestCase {
+  public TestBasicUrlNormalizer(String name) { super(name); }
+
+  public void testNormalizer() throws Exception {
+    // check that leading and trailing spaces are removed
+    normalizeTest(" http://foo.com/ ", "http://foo.com/");
+
+    // check that protocol is lower cased
+    normalizeTest("HTTP://foo.com/", "http://foo.com/");
+
+    // check that host is lower cased
+    normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
+    normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
+
+    // check that port number is normalized
+    normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
+    normalizeTest("http://foo.com:81/", "http://foo.com:81/");
+
+    // check that null path is normalized
+    normalizeTest("http://foo.com", "http://foo.com/");
+
+    // check that references are removed
+    normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html");
+
+//     // check that encoding is normalized
+//     normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
+  }
+
+  private void normalizeTest(String weird, String normal) throws Exception {
+    assertEquals(normal, UrlNormalizerFactory.getNormalizer().normalize(weird));
+  }
+	
+  public static void main(String[] args) throws Exception {
+    new TestBasicUrlNormalizer("test").testNormalizer();
+  }
+
+
+
+}
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/test/net/nutch/net/test-regex-normalize.xml nutch-changed/src/test/net/nutch/net/test-regex-normalize.xml
--- nutch/src/test/net/nutch/net/test-regex-normalize.xml	1969-12-31 19:00:00.000000000 -0500
+++ nutch-changed/src/test/net/nutch/net/test-regex-normalize.xml	2004-09-04 14:49:33.000000000 -0400
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!-- This is the configuration file for the RegexUrlNormalizer class.
+     This is intended so that users can specify substitutions to be
+     done on URLs. The regex engine that is used is Perl5 compatible.
+     The rules are applied to URLs in the order they occur in this file.  -->
+
+<!-- WATCH OUT: an XML parser reads this file, so ampersands must be
+     escaped as &amp; -->
+
+<!-- The following rules show how to strip out session IDs 
+     that are 32 characters long and have the parameter 
+     name of PHPSESSID. Order does matter!  -->
+<regex-normalize>
+<regex>
+  <pattern>(\?|\&amp;|\&amp;amp;)PHPSESSID=[a-zA-Z0-9]{32}$</pattern>
+  <substitution></substitution>
+</regex>
+<regex>
+  <pattern>(\?|\&amp;|\&amp;amp;)PHPSESSID=[a-zA-Z0-9]{32}(\&amp;|\&amp;amp;)(.*)</pattern>
+  <substitution>$1$3</substitution>
+</regex>
+</regex-normalize>
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/test/net/nutch/net/TestRegexUrlNormalizer.java nutch-changed/src/test/net/nutch/net/TestRegexUrlNormalizer.java
--- nutch/src/test/net/nutch/net/TestRegexUrlNormalizer.java	1969-12-31 19:00:00.000000000 -0500
+++ nutch-changed/src/test/net/nutch/net/TestRegexUrlNormalizer.java	2004-09-04 14:49:33.000000000 -0400
@@ -0,0 +1,39 @@
+/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
+/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
+
+package net.nutch.net;
+
+import java.net.URL;
+import junit.framework.TestCase;
+import net.nutch.net.RegexUrlNormalizer;
+
+/** Unit tests for RegexUrlNormalizer. */
+public class TestRegexUrlNormalizer extends TestBasicUrlNormalizer {
+  public TestRegexUrlNormalizer(String name) { super(name); }
+
+  public void testNormalizer() throws Exception {
+    normalizeTest("http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03",
+      "http://foo.com/foo.php?f=2");
+    normalizeTest("http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3",
+      "http://foo.com/foo.php?f=2&q=3");
+    normalizeTest("http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2",
+      "http://foo.com/foo.php?f=2");
+    normalizeTest("http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03",
+      "http://foo.com/foo.php");
+  }
+
+  private void normalizeTest(String weird, String normal) throws Exception {
+    String testSrcDir = System.getProperty("test.src.dir");
+    String path = testSrcDir + "/net/nutch/net/test-regex-normalize.xml";
+    RegexUrlNormalizer normalizer = new RegexUrlNormalizer(path);
+    assertEquals(normal, normalizer.normalize(weird));
+  }
+	
+  public static void main(String[] args) throws Exception {
+    new TestRegexUrlNormalizer("test").testNormalizer();
+    new TestBasicUrlNormalizer("test").testNormalizer(); // need to make sure it passes this test too
+  }
+
+
+
+}
diff -Nur --exclude='*.txt' --exclude='*-site.xml' --exclude='*.html' --exclude='*.jar' --exclude='*.class' --exclude='*CVS*' --exclude='*Pdf*' nutch/src/test/net/nutch/net/TestUrlNormalizer.java nutch-changed/src/test/net/nutch/net/TestUrlNormalizer.java
--- nutch/src/test/net/nutch/net/TestUrlNormalizer.java	2004-04-23 15:32:34.000000000 -0400
+++ nutch-changed/src/test/net/nutch/net/TestUrlNormalizer.java	1969-12-31 19:00:00.000000000 -0500
@@ -1,48 +0,0 @@
-/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
-/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
-
-package net.nutch.net;
-
-import java.net.URL;
-import junit.framework.TestCase;
-
-/** Unit tests for UrlNormalizer. */
-public class TestUrlNormalizer extends TestCase {
-  public TestUrlNormalizer(String name) { super(name); }
-
-  public void testNormalizer() throws Exception {
-    // check that leading and trailing spaces are removed
-    normalizeTest(" http://foo.com/ ", "http://foo.com/");
-
-    // check that protocol is lower cased
-    normalizeTest("HTTP://foo.com/", "http://foo.com/");
-
-    // check that host is lower cased
-    normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
-    normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
-
-    // check that port number is normalized
-    normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
-    normalizeTest("http://foo.com:81/", "http://foo.com:81/");
-
-    // check that null path is normalized
-    normalizeTest("http://foo.com", "http://foo.com/");
-
-    // check that references are removed
-    normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html");
-
-//     // check that encoding is normalized
-//     normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
-  }
-
-  private void normalizeTest(String weird, String normal) throws Exception {
-    assertEquals(normal, UrlNormalizer.normalize(weird));
-  }
-	
-  public static void main(String[] args) throws Exception {
-    new TestUrlNormalizer("test").testNormalizer();
-  }
-
-
-
-}

[Nutch-dev] RegexUrlNormalizer

Reply via email to