Author: siren Date: Sat Mar 10 23:36:56 2007 New Revision: 516865 URL: http://svn.apache.org/viewvc?view=rev&rev=516865 Log: change urlnormalizer-regex to use regular expressions from jre
Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?view=diff&rev=516865&r1=516864&r2=516865 ============================================================================== --- lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original) +++ lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Sat Mar 10 23:36:56 2007 @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.nutch.net.urlnormalizer.regex; import java.net.URL; @@ -28,6 +27,7 @@ import java.util.List; import java.util.ArrayList; import java.util.Iterator; +import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -40,7 +40,6 @@ import javax.xml.parsers.*; import org.w3c.dom.*; -import org.apache.oro.text.regex.*; /** * Allows users to do regex substitutions on all/any URLs that are encountered, @@ -65,16 +64,14 @@ * string. */ private static class Rule { - public Perl5Pattern pattern; + public Pattern pattern; public String substitution; } - private HashMap scopedRules; + private HashMap<String, List<Rule>> scopedRules; - private static final List EMPTY_RULES = Collections.EMPTY_LIST; - - private PatternMatcher matcher = new Perl5Matcher(); + private static final List<Rule> EMPTY_RULES = Collections.EMPTY_LIST; /** * The default constructor which is called from UrlNormalizerFactory @@ -93,9 +90,9 @@ * configuration files for it. */ public RegexURLNormalizer(Configuration conf, String filename) - throws IOException, MalformedPatternException { + throws IOException { super(conf); - List rules = readConfigurationFile(filename); + List<Rule> rules = readConfigurationFile(filename); if (rules != null) scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules); } @@ -106,9 +103,9 @@ // the default constructor was called if (this.scopedRules == null) { String filename = getConf().get("urlnormalizer.regex.file"); - scopedRules = new HashMap(); + scopedRules = new HashMap<String, List<Rule>>(); URL url = getConf().getResource(filename); - List rules = null; + List<Rule> rules = null; if (url == null) { LOG.warn("Can't load the default config file! " + filename); rules = EMPTY_RULES; @@ -126,7 +123,7 @@ // used in JUnit test. void setConfiguration(InputStream is, String scope) { - List rules = readConfiguration(is); + List<Rule> rules = readConfiguration(is); scopedRules.put(scope, rules); LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules."); } @@ -136,7 +133,7 @@ * patterns. It accepts a string url as input and returns the altered string. */ public synchronized String regexNormalize(String urlString, String scope) { - List curRules = (List)scopedRules.get(scope); + List<Rule> curRules = scopedRules.get(scope); if (curRules == null) { // try to populate String configFile = getConf().get("urlnormalizer.regex.file." + scope); @@ -147,7 +144,6 @@ LOG.warn("Can't load resource for config file: " + configFile); } else { try { - InputStream is = resource.openStream(); curRules = readConfiguration(resource.openStream()); scopedRules.put(scope, curRules); } catch (Exception e) { @@ -162,14 +158,11 @@ } if (curRules == EMPTY_RULES || curRules == null) { // use global rules - curRules = (List)scopedRules.get(URLNormalizers.SCOPE_DEFAULT); + curRules = scopedRules.get(URLNormalizers.SCOPE_DEFAULT); } - Iterator i = curRules.iterator(); - while (i.hasNext()) { - Rule r = (Rule) i.next(); - urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution( - r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual - // substitution + + for (Rule rule: curRules) { + urlString = rule.pattern.matcher(urlString).replaceAll(rule.substitution); } return urlString; } @@ -180,7 +173,7 @@ } /** Reads the configuration file and populates a List of Rules. */ - private List readConfigurationFile(String filename) { + private List<Rule> readConfigurationFile(String filename) { if (LOG.isInfoEnabled()) { LOG.info("loading " + filename); } @@ -193,9 +186,8 @@ } } - private List readConfiguration(InputStream is) { - Perl5Compiler compiler = new Perl5Compiler(); - List rules = new ArrayList(); + private List<Rule> readConfiguration(InputStream is) { + List<Rule> rules = new ArrayList<Rule>(); try { // borrowed heavily from code in Configuration.java @@ -233,7 +225,7 @@ } if (patternValue != null && subValue != null) { Rule rule = new Rule(); - rule.pattern = (Perl5Pattern) compiler.compile(patternValue); + rule.pattern = Pattern.compile(patternValue); rule.substitution = subValue; rules.add(rule); } @@ -249,15 +241,14 @@ } /** Spits out patterns and substitutions that are in the configuration file. */ - public static void main(String args[]) throws MalformedPatternException, - IOException { + public static void main(String args[]) throws IOException { RegexURLNormalizer normalizer = new RegexURLNormalizer(); normalizer.setConf(NutchConfiguration.create()); Iterator i = ((List)normalizer.scopedRules.get(URLNormalizers.SCOPE_DEFAULT)).iterator(); System.out.println("* Rules for 'DEFAULT' scope:"); while (i.hasNext()) { Rule r = (Rule) i.next(); - System.out.print(" " + r.pattern.getPattern() + " -> "); + System.out.print(" " + r.pattern.pattern() + " -> "); System.out.println(r.substitution); } // load the scope @@ -273,7 +264,7 @@ i = ((List)normalizer.scopedRules.get(scope)).iterator(); while (i.hasNext()) { Rule r = (Rule) i.next(); - System.out.print(" " + r.pattern.getPattern() + " -> "); + System.out.print(" " + r.pattern.pattern() + " -> "); System.out.println(r.substitution); } } ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs