Author: siren
Date: Sat Mar 10 23:36:56 2007
New Revision: 516865

URL: http://svn.apache.org/viewvc?view=rev&rev=516865
Log:
change urlnormalizer-regex to use regular expressions from jre

Modified:
    lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java

Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?view=diff&rev=516865&r1=516864&r2=516865
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Sat Mar 10 23:36:56 2007
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.urlnormalizer.regex;
 
 import java.net.URL;
@@ -28,6 +27,7 @@
 import java.util.List;
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.regex.Pattern;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -40,7 +40,6 @@
 
 import javax.xml.parsers.*;
 import org.w3c.dom.*;
-import org.apache.oro.text.regex.*;
 
 /**
  * Allows users to do regex substitutions on all/any URLs that are encountered,
@@ -65,16 +64,14 @@
    * string.
    */
   private static class Rule {
-    public Perl5Pattern pattern;
+    public Pattern pattern;
 
     public String substitution;
   }
 
-  private HashMap scopedRules;
+  private HashMap<String, List<Rule>> scopedRules;
   
-  private static final List EMPTY_RULES = Collections.EMPTY_LIST;
-
-  private PatternMatcher matcher = new Perl5Matcher();
+  private static final List<Rule> EMPTY_RULES = Collections.EMPTY_LIST;
 
   /**
    * The default constructor which is called from UrlNormalizerFactory
@@ -93,9 +90,9 @@
    * configuration files for it.
    */
   public RegexURLNormalizer(Configuration conf, String filename)
-          throws IOException, MalformedPatternException {
+          throws IOException {
     super(conf);
-    List rules = readConfigurationFile(filename);
+    List<Rule> rules = readConfigurationFile(filename);
     if (rules != null)
       scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules);
   }
@@ -106,9 +103,9 @@
     // the default constructor was called
     if (this.scopedRules == null) {
       String filename = getConf().get("urlnormalizer.regex.file");
-      scopedRules = new HashMap();
+      scopedRules = new HashMap<String, List<Rule>>();
       URL url = getConf().getResource(filename);
-      List rules = null;
+      List<Rule> rules = null;
       if (url == null) {
         LOG.warn("Can't load the default config file! " + filename);
         rules = EMPTY_RULES;
@@ -126,7 +123,7 @@
 
   // used in JUnit test.
   void setConfiguration(InputStream is, String scope) {
-    List rules = readConfiguration(is);
+    List<Rule> rules = readConfiguration(is);
     scopedRules.put(scope, rules);
     LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules.");
   }
@@ -136,7 +133,7 @@
    * patterns. It accepts a string url as input and returns the altered string.
    */
   public synchronized String regexNormalize(String urlString, String scope) {
-    List curRules = (List)scopedRules.get(scope);
+    List<Rule> curRules = scopedRules.get(scope);
     if (curRules == null) {
       // try to populate
       String configFile = getConf().get("urlnormalizer.regex.file." + scope);
@@ -147,7 +144,6 @@
           LOG.warn("Can't load resource for config file: " + configFile);
         } else {
           try {
-            InputStream is = resource.openStream();
             curRules = readConfiguration(resource.openStream());
             scopedRules.put(scope, curRules);
           } catch (Exception e) {
@@ -162,14 +158,11 @@
     }
     if (curRules == EMPTY_RULES || curRules == null) {
       // use global rules
-      curRules = (List)scopedRules.get(URLNormalizers.SCOPE_DEFAULT);
+      curRules = scopedRules.get(URLNormalizers.SCOPE_DEFAULT);
     }
-    Iterator i = curRules.iterator();
-    while (i.hasNext()) {
-      Rule r = (Rule) i.next();
-      urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution(
-              r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual
-                                                                // substitution
+    
+    for (Rule rule: curRules) {
+      urlString = rule.pattern.matcher(urlString).replaceAll(rule.substitution);
     }
     return urlString;
   }
@@ -180,7 +173,7 @@
   }
 
   /** Reads the configuration file and populates a List of Rules. */
-  private List readConfigurationFile(String filename) {
+  private List<Rule> readConfigurationFile(String filename) {
     if (LOG.isInfoEnabled()) {
       LOG.info("loading " + filename);
     }
@@ -193,9 +186,8 @@
     }
   }
   
-  private List readConfiguration(InputStream is) {
-    Perl5Compiler compiler = new Perl5Compiler();
-    List rules = new ArrayList();
+  private List<Rule> readConfiguration(InputStream is) {
+    List<Rule> rules = new ArrayList<Rule>();
     try {
 
       // borrowed heavily from code in Configuration.java
@@ -233,7 +225,7 @@
         }
         if (patternValue != null && subValue != null) {
           Rule rule = new Rule();
-          rule.pattern = (Perl5Pattern) compiler.compile(patternValue);
+          rule.pattern = Pattern.compile(patternValue);
           rule.substitution = subValue;
           rules.add(rule);
         }
@@ -249,15 +241,14 @@
   }
 
   /** Spits out patterns and substitutions that are in the configuration file. */
-  public static void main(String args[]) throws MalformedPatternException,
-          IOException {
+  public static void main(String args[]) throws IOException {
     RegexURLNormalizer normalizer = new RegexURLNormalizer();
     normalizer.setConf(NutchConfiguration.create());
     Iterator i = ((List)normalizer.scopedRules.get(URLNormalizers.SCOPE_DEFAULT)).iterator();
     System.out.println("* Rules for 'DEFAULT' scope:");
     while (i.hasNext()) {
       Rule r = (Rule) i.next();
-      System.out.print("  " + r.pattern.getPattern() + " -> ");
+      System.out.print("  " + r.pattern.pattern() + " -> ");
       System.out.println(r.substitution);
     }
     // load the scope
@@ -273,7 +264,7 @@
         i = ((List)normalizer.scopedRules.get(scope)).iterator();
         while (i.hasNext()) {
           Rule r = (Rule) i.next();
-          System.out.print("  " + r.pattern.getPattern() + " -> ");
+          System.out.print("  " + r.pattern.pattern() + " -> ");
           System.out.println(r.substitution);
         }
       }



-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to